diff --git a/clang/include/clang/AST/ASTImporter.h b/clang/include/clang/AST/ASTImporter.h index 4ffd9138465757..f851decd0965ce 100644 --- a/clang/include/clang/AST/ASTImporter.h +++ b/clang/include/clang/AST/ASTImporter.h @@ -258,7 +258,6 @@ class TypeSourceInfo; FoundDeclsTy findDeclsInToCtx(DeclContext *DC, DeclarationName Name); void AddToLookupTable(Decl *ToD); - llvm::Error ImportAttrs(Decl *ToD, Decl *FromD); protected: /// Can be overwritten by subclasses to implement their own import logic. diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index fb52ac804849d8..3a110454f29eda 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -1188,10 +1188,6 @@ class CXXRecordDecl : public RecordDecl { /// /// \note This does NOT include a check for union-ness. bool isEmpty() const { return data().Empty; } - /// Marks this record as empty. This is used by DWARFASTParserClang - /// when parsing records with empty fields having [[no_unique_address]] - /// attribute - void markEmpty() { data().Empty = true; } void setInitMethod(bool Val) { data().HasInitMethod = Val; } bool hasInitMethod() const { return data().HasInitMethod; } diff --git a/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h b/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h index 420f13ce11bfde..5de66fcb0e3afb 100644 --- a/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h +++ b/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h @@ -18,6 +18,7 @@ #include "clang/AST/Decl.h" #include "clang/AST/Stmt.h" #include "clang/Analysis/CFG.h" +#include "clang/Analysis/FlowSensitive/ASTOps.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/Error.h" @@ -27,6 +28,24 @@ namespace clang { namespace dataflow { +namespace internal { +class StmtToBlockMap { +public: + StmtToBlockMap(const CFG &Cfg); + + const CFGBlock *lookup(const Stmt &S) const { + return StmtToBlock.lookup(&ignoreCFGOmittedNodes(S)); + } + + const llvm::DenseMap &getMap() const { + return StmtToBlock; + } + +private: + llvm::DenseMap StmtToBlock; +}; +} // namespace internal + /// Holds CFG with additional information derived from it that is needed to /// perform dataflow analysis. class AdornedCFG { @@ -49,8 +68,17 @@ class AdornedCFG { const CFG &getCFG() const { return *Cfg; } /// Returns a mapping from statements to basic blocks that contain them. + /// Deprecated. Use `blockForStmt()` instead (which prevents the potential + /// error of forgetting to call `ignoreCFGOmittedNodes()` on the statement to + /// look up). const llvm::DenseMap &getStmtToBlock() const { - return StmtToBlock; + return StmtToBlock.getMap(); + } + + /// Returns the basic block that contains `S`, or null if no basic block + /// containing `S` is found. + const CFGBlock *blockForStmt(const Stmt &S) const { + return StmtToBlock.lookup(S); } /// Returns whether `B` is reachable from the entry block. @@ -73,8 +101,7 @@ class AdornedCFG { private: AdornedCFG( const Decl &D, std::unique_ptr Cfg, - llvm::DenseMap StmtToBlock, - llvm::BitVector BlockReachable, + internal::StmtToBlockMap StmtToBlock, llvm::BitVector BlockReachable, llvm::DenseSet ContainsExprConsumedInDifferentBlock) : ContainingDecl(D), Cfg(std::move(Cfg)), StmtToBlock(std::move(StmtToBlock)), @@ -85,7 +112,7 @@ class AdornedCFG { /// The `Decl` containing the statement used to construct the CFG. 
const Decl &ContainingDecl; std::unique_ptr Cfg; - llvm::DenseMap StmtToBlock; + internal::StmtToBlockMap StmtToBlock; llvm::BitVector BlockReachable; llvm::DenseSet ContainsExprConsumedInDifferentBlock; }; diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td index 12a4617c64d87e..8a1462c670d68f 100644 --- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td +++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td @@ -288,6 +288,9 @@ def err_function_needs_feature : Error< let CategoryName = "Codegen ABI Check" in { def err_function_always_inline_attribute_mismatch : Error< "always_inline function %1 and its caller %0 have mismatching %2 attributes">; +def warn_function_always_inline_attribute_mismatch : Warning< + "always_inline function %1 and its caller %0 have mismatching %2 attributes, " + "inlining may change runtime behaviour">, InGroup; def err_function_always_inline_new_za : Error< "always_inline function %0 has new za state">; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index fba1982cf78714..93a9fe98d1d4a8 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -980,15 +980,15 @@ def Wsystem_headers_in_module_EQ : Joined<["-"], "Wsystem-headers-in-module=">, HelpText<"Enable -Wsystem-headers when building ">, MarshallingInfoStringVector>; def Wdeprecated : Flag<["-"], "Wdeprecated">, Group, - Visibility<[ClangOption, CC1Option]>, + Flags<[HelpHidden]>, Visibility<[ClangOption, CC1Option]>, HelpText<"Enable warnings for deprecated constructs and define __DEPRECATED">; def Wno_deprecated : Flag<["-"], "Wno-deprecated">, Group, Visibility<[ClangOption, CC1Option]>; defm invalid_constexpr : BoolWOption<"invalid-constexpr", LangOpts<"CheckConstexprFunctionBodies">, Default, - NegFlag, - PosFlag, + NegFlag, + PosFlag, BothFlags<[], [ClangOption, CC1Option], " checking of constexpr function bodies for validity within a constant expression context">>; def Wl_COMMA : CommaJoined<["-"], "Wl,">, Visibility<[ClangOption, FlangOption]>, Flags<[LinkerInput, RenderAsInput]>, diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 08ef09d353afc9..da1981d8dd05f1 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -4179,12 +4179,6 @@ ExpectedDecl ASTNodeImporter::VisitFieldDecl(FieldDecl *D) { D->getInClassInitStyle())) return ToField; - // We need [[no_unqiue_address]] attributes to be added to FieldDecl, before - // we add fields in CXXRecordDecl::addedMember, otherwise record will be - // marked as having non-zero size. - Err = Importer.ImportAttrs(ToField, D); - if (Err) - return std::move(Err); ToField->setAccess(D->getAccess()); ToField->setLexicalDeclContext(LexicalDC); ToField->setImplicit(D->isImplicit()); @@ -9399,19 +9393,6 @@ TranslationUnitDecl *ASTImporter::GetFromTU(Decl *ToD) { return FromDPos->second->getTranslationUnitDecl(); } -Error ASTImporter::ImportAttrs(Decl *ToD, Decl *FromD) { - if (!FromD->hasAttrs() || ToD->hasAttrs()) - return Error::success(); - for (const Attr *FromAttr : FromD->getAttrs()) { - auto ToAttrOrErr = Import(FromAttr); - if (ToAttrOrErr) - ToD->addAttr(*ToAttrOrErr); - else - return ToAttrOrErr.takeError(); - } - return Error::success(); -} - Expected ASTImporter::Import(Decl *FromD) { if (!FromD) return nullptr; @@ -9545,8 +9526,15 @@ Expected ASTImporter::Import(Decl *FromD) { } // Make sure that ImportImpl registered the imported decl. 
assert(ImportedDecls.count(FromD) != 0 && "Missing call to MapImported?"); - if (auto Error = ImportAttrs(ToD, FromD)) - return std::move(Error); + + if (FromD->hasAttrs()) + for (const Attr *FromAttr : FromD->getAttrs()) { + auto ToAttrOrErr = Import(FromAttr); + if (ToAttrOrErr) + ToD->addAttr(*ToAttrOrErr); + else + return ToAttrOrErr.takeError(); + } // Notify subclasses. Imported(FromD, ToD); diff --git a/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp b/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp index 255543021a998c..876b5a3db52498 100644 --- a/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp +++ b/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp @@ -16,6 +16,7 @@ #include "clang/AST/Decl.h" #include "clang/AST/Stmt.h" #include "clang/Analysis/CFG.h" +#include "clang/Analysis/FlowSensitive/ASTOps.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/Error.h" @@ -96,8 +97,7 @@ static llvm::BitVector findReachableBlocks(const CFG &Cfg) { static llvm::DenseSet buildContainsExprConsumedInDifferentBlock( - const CFG &Cfg, - const llvm::DenseMap &StmtToBlock) { + const CFG &Cfg, const internal::StmtToBlockMap &StmtToBlock) { llvm::DenseSet Result; auto CheckChildExprs = [&Result, &StmtToBlock](const Stmt *S, @@ -105,7 +105,7 @@ buildContainsExprConsumedInDifferentBlock( for (const Stmt *Child : S->children()) { if (!isa_and_nonnull(Child)) continue; - const CFGBlock *ChildBlock = StmtToBlock.lookup(Child); + const CFGBlock *ChildBlock = StmtToBlock.lookup(*Child); if (ChildBlock != Block) Result.insert(ChildBlock); } @@ -126,6 +126,13 @@ buildContainsExprConsumedInDifferentBlock( return Result; } +namespace internal { + +StmtToBlockMap::StmtToBlockMap(const CFG &Cfg) + : StmtToBlock(buildStmtToBasicBlockMap(Cfg)) {} + +} // namespace internal + llvm::Expected AdornedCFG::build(const FunctionDecl &Func) { if (!Func.doesThisDeclarationHaveABody()) return llvm::createStringError( @@ -166,8 +173,7 @@ llvm::Expected AdornedCFG::build(const Decl &D, Stmt &S, std::make_error_code(std::errc::invalid_argument), "CFG::buildCFG failed"); - llvm::DenseMap StmtToBlock = - buildStmtToBasicBlockMap(*Cfg); + internal::StmtToBlockMap StmtToBlock(*Cfg); llvm::BitVector BlockReachable = findReachableBlocks(*Cfg); diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index 3c896d373a211d..9c54eb16d22246 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -40,17 +40,16 @@ namespace clang { namespace dataflow { const Environment *StmtToEnvMap::getEnvironment(const Stmt &S) const { - auto BlockIt = ACFG.getStmtToBlock().find(&ignoreCFGOmittedNodes(S)); - if (BlockIt == ACFG.getStmtToBlock().end()) { + const CFGBlock *Block = ACFG.blockForStmt(S); + if (Block == nullptr) { assert(false); - // Return null to avoid dereferencing the end iterator in non-assert builds. 
     return nullptr;
   }
-  if (!ACFG.isBlockReachable(*BlockIt->getSecond()))
+  if (!ACFG.isBlockReachable(*Block))
     return nullptr;
-  if (BlockIt->getSecond()->getBlockID() == CurBlockID)
+  if (Block->getBlockID() == CurBlockID)
     return &CurState.Env;
-  const auto &State = BlockToState[BlockIt->getSecond()->getBlockID()];
+  const auto &State = BlockToState[Block->getBlockID()];
   if (!(State))
     return nullptr;
   return &State->Env;
diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
index 200682faafd6ab..8afd18b315d286 100644
--- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
+++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
@@ -243,10 +243,11 @@ computeBlockInputState(const CFGBlock &Block, AnalysisContext &AC) {
     // See `NoreturnDestructorTest` for concrete examples.
     if (Block.succ_begin()->getReachableBlock() != nullptr &&
         Block.succ_begin()->getReachableBlock()->hasNoReturnElement()) {
-      auto &StmtToBlock = AC.ACFG.getStmtToBlock();
-      auto StmtBlock = StmtToBlock.find(Block.getTerminatorStmt());
-      assert(StmtBlock != StmtToBlock.end());
-      llvm::erase(Preds, StmtBlock->getSecond());
+      const CFGBlock *StmtBlock = nullptr;
+      if (const Stmt *Terminator = Block.getTerminatorStmt())
+        StmtBlock = AC.ACFG.blockForStmt(*Terminator);
+      assert(StmtBlock != nullptr);
+      llvm::erase(Preds, StmtBlock);
     }
   }
 
diff --git a/clang/lib/Analysis/LiveVariables.cpp b/clang/lib/Analysis/LiveVariables.cpp
index 6d03dd05ca3d27..481932ee59c8ee 100644
--- a/clang/lib/Analysis/LiveVariables.cpp
+++ b/clang/lib/Analysis/LiveVariables.cpp
@@ -214,6 +214,22 @@ static void AddLiveExpr(llvm::ImmutableSet<const Expr *> &Set,
   Set = F.add(Set, LookThroughExpr(E));
 }
 
+/// Add as a live expression all individual conditions in a logical expression.
+/// For example, for the expression:
+/// "(a < b) || (c && d && ((e || f) != (g && h)))"
+/// the following expressions will be added as live:
+/// "a < b", "c", "d", "((e || f) != (g && h))"
+static void AddAllConditionalTerms(llvm::ImmutableSet<const Expr *> &Set,
+                                   llvm::ImmutableSet<const Expr *>::Factory &F,
+                                   const Expr *Cond) {
+  AddLiveExpr(Set, F, Cond);
+  if (auto const *BO = dyn_cast<BinaryOperator>(Cond->IgnoreParens());
+      BO && BO->isLogicalOp()) {
+    AddAllConditionalTerms(Set, F, BO->getLHS());
+    AddAllConditionalTerms(Set, F, BO->getRHS());
+  }
+}
+
 void TransferFunctions::Visit(Stmt *S) {
   if (observer)
     observer->observeStmt(S, currentBlock, val);
@@ -313,7 +329,27 @@ void TransferFunctions::Visit(Stmt *S) {
     AddLiveExpr(val.liveExprs, LV.ESetFact, cast(S)->getCond());
     return;
   }
-
+  case Stmt::ConditionalOperatorClass: {
+    // Keep not only direct children alive, but also all the short-circuited
+    // parts of the condition. Short-circuiting evaluation may cause the
+    // conditional operator evaluation to skip the evaluation of the entire
+    // condition expression, so the value of the entire condition expression is
+    // never computed.
+    //
+    // This makes a difference when we compare exploded nodes coming from true
+    // and false expressions with no side effects: the only difference in the
+    // state is the value of (part of) the condition.
+    //
+    // BinaryConditionalOperatorClass ('x ?: y') is not affected because it
+    // explicitly calculates the value of the entire condition expression (to
+    // possibly use as a value for the "true expr") even if it is
+    // short-circuited.
+ auto const *CO = cast(S); + AddAllConditionalTerms(val.liveExprs, LV.ESetFact, CO->getCond()); + AddLiveExpr(val.liveExprs, LV.ESetFact, CO->getTrueExpr()); + AddLiveExpr(val.liveExprs, LV.ESetFact, CO->getFalseExpr()); + return; + } } // HACK + FIXME: What is this? One could only guess that this is an attempt to diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 86df625c9dae99..3876d89585535d 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -1993,7 +1993,12 @@ CGDebugInfo::getOrCreateMethodType(const CXXMethodDecl *Method, if (Method->isStatic()) return cast_or_null( getOrCreateType(QualType(Func, 0), Unit)); - return getOrCreateInstanceMethodType(Method->getThisType(), Func, Unit); + + QualType ThisType; + if (!Method->hasCXXExplicitFunctionObjectParameter()) + ThisType = Method->getThisType(); + + return getOrCreateInstanceMethodType(ThisType, Func, Unit); } llvm::DISubroutineType *CGDebugInfo::getOrCreateInstanceMethodType( @@ -2025,27 +2030,31 @@ llvm::DISubroutineType *CGDebugInfo::getOrCreateInstanceMethodType( Elts.push_back(Args[0]); // "this" pointer is always first argument. - const CXXRecordDecl *RD = ThisPtr->getPointeeCXXRecordDecl(); - if (isa(RD)) { - // Create pointer type directly in this case. - const PointerType *ThisPtrTy = cast(ThisPtr); - uint64_t Size = CGM.getContext().getTypeSize(ThisPtrTy); - auto Align = getTypeAlignIfRequired(ThisPtrTy, CGM.getContext()); - llvm::DIType *PointeeType = - getOrCreateType(ThisPtrTy->getPointeeType(), Unit); - llvm::DIType *ThisPtrType = - DBuilder.createPointerType(PointeeType, Size, Align); - TypeCache[ThisPtr.getAsOpaquePtr()].reset(ThisPtrType); - // TODO: This and the artificial type below are misleading, the - // types aren't artificial the argument is, but the current - // metadata doesn't represent that. - ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType); - Elts.push_back(ThisPtrType); - } else { - llvm::DIType *ThisPtrType = getOrCreateType(ThisPtr, Unit); - TypeCache[ThisPtr.getAsOpaquePtr()].reset(ThisPtrType); - ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType); - Elts.push_back(ThisPtrType); + // ThisPtr may be null if the member function has an explicit 'this' + // parameter. + if (!ThisPtr.isNull()) { + const CXXRecordDecl *RD = ThisPtr->getPointeeCXXRecordDecl(); + if (isa(RD)) { + // Create pointer type directly in this case. + const PointerType *ThisPtrTy = cast(ThisPtr); + uint64_t Size = CGM.getContext().getTypeSize(ThisPtrTy); + auto Align = getTypeAlignIfRequired(ThisPtrTy, CGM.getContext()); + llvm::DIType *PointeeType = + getOrCreateType(ThisPtrTy->getPointeeType(), Unit); + llvm::DIType *ThisPtrType = + DBuilder.createPointerType(PointeeType, Size, Align); + TypeCache[ThisPtr.getAsOpaquePtr()].reset(ThisPtrType); + // TODO: This and the artificial type below are misleading, the + // types aren't artificial the argument is, but the current + // metadata doesn't represent that. + ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType); + Elts.push_back(ThisPtrType); + } else { + llvm::DIType *ThisPtrType = getOrCreateType(ThisPtr, Unit); + TypeCache[ThisPtr.getAsOpaquePtr()].reset(ThisPtrType); + ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType); + Elts.push_back(ThisPtrType); + } } // Copy rest of the arguments. 
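The CGDebugInfo changes above are exercised by C++23 explicit object member functions ("deducing this"), where there is no implicit `this` for the debug info to describe. A minimal source-level illustration of the case that now reaches `getOrCreateInstanceMethodType` with a null `ThisPtr` (my own example, mirroring the new test further down; compile with `clang++ -std=c++23 -g`):

```cpp
// A member function with an explicit object parameter: the object is an
// ordinary parameter, so Method->hasCXXExplicitFunctionObjectParameter()
// is true and no artificial 'this' pointer type belongs in the debug info.
struct Widget {
  int Value = 0;
  int get(this const Widget &Self) { return Self.Value; }
};

int main() {
  Widget W{42};
  return W.get() == 42 ? 0 : 1;
}
```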
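Stepping back to the `LiveVariables.cpp` change earlier in this patch: the recursion in `AddAllConditionalTerms` is easier to see in isolation. Below is a standalone sketch (a toy expression type of my own, not clang's AST) that collects the same set of terms; note that the composite logical subexpressions are recorded too, because the add happens before the recursion:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Toy stand-in for clang's Expr: just enough structure for logical operators.
struct ToyExpr {
  std::string Text;          // printable form of this subexpression
  bool IsLogicalOp = false;  // true for && and ||
  const ToyExpr *LHS = nullptr;
  const ToyExpr *RHS = nullptr;
};

// Mirrors AddAllConditionalTerms: record the node itself, then recurse into
// both operands whenever the node is a logical operator.
void collectConditionalTerms(const ToyExpr &Cond,
                             std::vector<std::string> &Live) {
  Live.push_back(Cond.Text);
  if (Cond.IsLogicalOp) {
    collectConditionalTerms(*Cond.LHS, Live);
    collectConditionalTerms(*Cond.RHS, Live);
  }
}

int main() {
  ToyExpr A{"a < b"}, C{"c"}, D{"d"};
  ToyExpr CAndD{"c && d", true, &C, &D};
  ToyExpr Cond{"(a < b) || (c && d)", true, &A, &CAndD};
  std::vector<std::string> Live;
  collectConditionalTerms(Cond, Live);
  for (const std::string &S : Live)
    std::cout << S << "\n";  // whole condition, "a < b", "c && d", "c", "d"
}
```

Evaluation of `(a < b) || (c && d) ? x : y` may stop right after `a < b`, so only a subset of these terms has a computed value at any given program point; keeping them all live is what lets the analyzer distinguish such states.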
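One more note, on the `AdornedCFG` interface introduced at the top of this patch: `blockForStmt()` exists to fold the `ignoreCFGOmittedNodes()` normalization into the lookup itself, which is exactly the mistake the deprecation comment on `getStmtToBlock()` warns about. A caller-side sketch (assumes a tree with this patch applied; it compiles against the clang dataflow headers rather than standalone):

```cpp
#include "clang/Analysis/FlowSensitive/AdornedCFG.h"

using namespace clang;
using namespace clang::dataflow;

const CFGBlock *lookupBlock(const AdornedCFG &ACFG, const Stmt &S) {
  // Deprecated spelling; easy to get wrong by forgetting the normalization:
  //   ACFG.getStmtToBlock().lookup(&ignoreCFGOmittedNodes(S));
  // New spelling; the normalization happens inside the map wrapper:
  return ACFG.blockForStmt(S);  // null if no basic block contains S
}
```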
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp index b9df54b0c67c40..1dec3cd40ebd21 100644 --- a/clang/lib/CodeGen/Targets/AArch64.cpp +++ b/clang/lib/CodeGen/Targets/AArch64.cpp @@ -883,8 +883,10 @@ void AArch64TargetCodeGenInfo::checkFunctionCallABIStreaming( if (!CalleeIsStreamingCompatible && (CallerIsStreaming != CalleeIsStreaming || CallerIsStreamingCompatible)) - CGM.getDiags().Report(CallLoc, - diag::err_function_always_inline_attribute_mismatch) + CGM.getDiags().Report( + CallLoc, CalleeIsStreaming + ? diag::err_function_always_inline_attribute_mismatch + : diag::warn_function_always_inline_attribute_mismatch) << Caller->getDeclName() << Callee->getDeclName() << "streaming"; if (auto *NewAttr = Callee->getAttr()) if (NewAttr->isNewZA()) diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 7ebc4871e323d8..42930cd106d060 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -342,6 +342,9 @@ void Flang::AddAMDGPUTargetArgs(const ArgList &Args, StringRef Val = A->getValue(); CmdArgs.push_back(Args.MakeArgString("-mcode-object-version=" + Val)); } + + const ToolChain &TC = getToolChain(); + TC.addClangTargetOptions(Args, CmdArgs, Action::OffloadKind::OFK_OpenMP); } void Flang::addTargetOptions(const ArgList &Args, diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 112cf3d0818227..7389046eaddde1 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -825,7 +825,6 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, if (auto *CCE = dyn_cast(Init)) { if (CCE->getConstructor()->isCopyOrMoveConstructor()) { if (auto *MTE = dyn_cast(CCE->getArg(0))) { - // assert(false && "hit temporary copy path"); Expr *Arg = MTE->getSubExpr(); Path.push_back({IndirectLocalPathEntry::TemporaryCopy, Arg, CCE->getConstructor()}); diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index 0602d07c6b9b0d..545da21183c3c4 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -39,6 +39,7 @@ #include "clang/Sema/Overload.h" #include "clang/Sema/Ownership.h" #include "clang/Sema/Scope.h" +#include "clang/Sema/SemaInternal.h" #include "clang/Sema/Template.h" #include "clang/Sema/TemplateDeduction.h" #include "llvm/ADT/ArrayRef.h" @@ -241,11 +242,10 @@ NamedDecl *buildDeductionGuide( } // Transform a given template type parameter `TTP`. -TemplateTypeParmDecl * -transformTemplateTypeParam(Sema &SemaRef, DeclContext *DC, - TemplateTypeParmDecl *TTP, - MultiLevelTemplateArgumentList &Args, - unsigned NewDepth, unsigned NewIndex) { +TemplateTypeParmDecl *transformTemplateTypeParam( + Sema &SemaRef, DeclContext *DC, TemplateTypeParmDecl *TTP, + MultiLevelTemplateArgumentList &Args, unsigned NewDepth, unsigned NewIndex, + bool EvaluateConstraint) { // TemplateTypeParmDecl's index cannot be changed after creation, so // substitute it directly. 
auto *NewTTP = TemplateTypeParmDecl::Create( @@ -257,7 +257,7 @@ transformTemplateTypeParam(Sema &SemaRef, DeclContext *DC, : std::nullopt); if (const auto *TC = TTP->getTypeConstraint()) SemaRef.SubstTypeConstraint(NewTTP, TC, Args, - /*EvaluateConstraint=*/true); + /*EvaluateConstraint=*/EvaluateConstraint); if (TTP->hasDefaultArgument()) { TemplateArgumentLoc InstantiatedDefaultArg; if (!SemaRef.SubstTemplateArgument( @@ -284,6 +284,22 @@ transformTemplateParam(Sema &SemaRef, DeclContext *DC, return NewParam; } +NamedDecl *transformTemplateParameter(Sema &SemaRef, DeclContext *DC, + NamedDecl *TemplateParam, + MultiLevelTemplateArgumentList &Args, + unsigned NewIndex, unsigned NewDepth, + bool EvaluateConstraint = true) { + if (auto *TTP = dyn_cast(TemplateParam)) + return transformTemplateTypeParam( + SemaRef, DC, TTP, Args, NewDepth, NewIndex, + /*EvaluateConstraint=*/EvaluateConstraint); + if (auto *TTP = dyn_cast(TemplateParam)) + return transformTemplateParam(SemaRef, DC, TTP, Args, NewIndex, NewDepth); + if (auto *NTTP = dyn_cast(TemplateParam)) + return transformTemplateParam(SemaRef, DC, NTTP, Args, NewIndex, NewDepth); + llvm_unreachable("Unhandled template parameter types"); +} + /// Transform to convert portions of a constructor declaration into the /// corresponding deduction guide, per C++1z [over.match.class.deduct]p1. struct ConvertConstructorToDeductionGuideTransform { @@ -358,7 +374,9 @@ struct ConvertConstructorToDeductionGuideTransform { Args.addOuterRetainedLevel(); if (NestedPattern) Args.addOuterRetainedLevels(NestedPattern->getTemplateDepth()); - NamedDecl *NewParam = transformTemplateParameter(Param, Args); + auto [Depth, Index] = getDepthAndIndex(Param); + NamedDecl *NewParam = transformTemplateParameter( + SemaRef, DC, Param, Args, Index + Depth1IndexAdjustment, Depth - 1); if (!NewParam) return nullptr; // Constraints require that we substitute depth-1 arguments @@ -366,12 +384,11 @@ struct ConvertConstructorToDeductionGuideTransform { Depth1Args.push_back(SemaRef.Context.getInjectedTemplateArg(NewParam)); if (NestedPattern) { - TemplateDeclInstantiator Instantiator(SemaRef, DC, - OuterInstantiationArgs); - Instantiator.setEvaluateConstraints(false); - SemaRef.runWithSufficientStackSpace(NewParam->getLocation(), [&] { - NewParam = cast(Instantiator.Visit(NewParam)); - }); + auto [Depth, Index] = getDepthAndIndex(NewParam); + NewParam = transformTemplateParameter( + SemaRef, DC, NewParam, OuterInstantiationArgs, Index, + Depth - OuterInstantiationArgs.getNumSubstitutedLevels(), + /*EvaluateConstraint=*/false); } assert(NewParam->getTemplateDepth() == 0 && @@ -479,25 +496,6 @@ struct ConvertConstructorToDeductionGuideTransform { } private: - /// Transform a constructor template parameter into a deduction guide template - /// parameter, rebuilding any internal references to earlier parameters and - /// renumbering as we go. 
- NamedDecl *transformTemplateParameter(NamedDecl *TemplateParam, - MultiLevelTemplateArgumentList &Args) { - if (auto *TTP = dyn_cast(TemplateParam)) - return transformTemplateTypeParam( - SemaRef, DC, TTP, Args, TTP->getDepth() - 1, - Depth1IndexAdjustment + TTP->getIndex()); - if (auto *TTP = dyn_cast(TemplateParam)) - return transformTemplateParam(SemaRef, DC, TTP, Args, - Depth1IndexAdjustment + TTP->getIndex(), - TTP->getDepth() - 1); - auto *NTTP = cast(TemplateParam); - return transformTemplateParam(SemaRef, DC, NTTP, Args, - Depth1IndexAdjustment + NTTP->getIndex(), - NTTP->getDepth() - 1); - } - QualType transformFunctionProtoType( TypeLocBuilder &TLB, FunctionProtoTypeLoc TL, SmallVectorImpl &Params, @@ -634,26 +632,6 @@ struct ConvertConstructorToDeductionGuideTransform { } }; -unsigned getTemplateParameterDepth(NamedDecl *TemplateParam) { - if (auto *TTP = dyn_cast(TemplateParam)) - return TTP->getDepth(); - if (auto *TTP = dyn_cast(TemplateParam)) - return TTP->getDepth(); - if (auto *NTTP = dyn_cast(TemplateParam)) - return NTTP->getDepth(); - llvm_unreachable("Unhandled template parameter types"); -} - -unsigned getTemplateParameterIndex(NamedDecl *TemplateParam) { - if (auto *TTP = dyn_cast(TemplateParam)) - return TTP->getIndex(); - if (auto *TTP = dyn_cast(TemplateParam)) - return TTP->getIndex(); - if (auto *NTTP = dyn_cast(TemplateParam)) - return NTTP->getIndex(); - llvm_unreachable("Unhandled template parameter types"); -} - // Find all template parameters that appear in the given DeducedArgs. // Return the indices of the template parameters in the TemplateParams. SmallVector TemplateParamsReferencedInTemplateArgumentList( @@ -689,8 +667,10 @@ SmallVector TemplateParamsReferencedInTemplateArgumentList( void MarkAppeared(NamedDecl *ND) { if (llvm::isa(ND)) - Mark(getTemplateParameterDepth(ND), getTemplateParameterIndex(ND)); + TemplateTemplateParmDecl>(ND)) { + auto [Depth, Index] = getDepthAndIndex(ND); + Mark(Depth, Index); + } } void Mark(unsigned Depth, unsigned Index) { if (Index < TemplateParamList->size() && @@ -722,20 +702,6 @@ bool hasDeclaredDeductionGuides(DeclarationName Name, DeclContext *DC) { return false; } -NamedDecl *transformTemplateParameter(Sema &SemaRef, DeclContext *DC, - NamedDecl *TemplateParam, - MultiLevelTemplateArgumentList &Args, - unsigned NewIndex, unsigned NewDepth) { - if (auto *TTP = dyn_cast(TemplateParam)) - return transformTemplateTypeParam(SemaRef, DC, TTP, Args, NewDepth, - NewIndex); - if (auto *TTP = dyn_cast(TemplateParam)) - return transformTemplateParam(SemaRef, DC, TTP, Args, NewIndex, NewDepth); - if (auto *NTTP = dyn_cast(TemplateParam)) - return transformTemplateParam(SemaRef, DC, NTTP, Args, NewIndex, NewDepth); - llvm_unreachable("Unhandled template parameter types"); -} - // Build the associated constraints for the alias deduction guides. 
// C++ [over.match.class.deduct]p3.3: // The associated constraints ([temp.constr.decl]) are the conjunction of the @@ -791,7 +757,7 @@ buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F, NamedDecl *NewParam = transformTemplateParameter( SemaRef, AliasTemplate->getDeclContext(), TP, Args, /*NewIndex=*/AdjustedAliasTemplateArgs.size(), - getTemplateParameterDepth(TP) + AdjustDepth); + getDepthAndIndex(TP).first + AdjustDepth); TemplateArgument NewTemplateArgument = Context.getInjectedTemplateArg(NewParam); @@ -814,10 +780,10 @@ buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F, Args.setKind(TemplateSubstitutionKind::Rewrite); Args.addOuterTemplateArguments(TemplateArgsForBuildingRC); // Rebuild the template parameter with updated depth and index. - NamedDecl *NewParam = transformTemplateParameter( - SemaRef, F->getDeclContext(), TP, Args, - /*NewIndex=*/FirstUndeducedParamIdx, - getTemplateParameterDepth(TP) + AdjustDepth); + NamedDecl *NewParam = + transformTemplateParameter(SemaRef, F->getDeclContext(), TP, Args, + /*NewIndex=*/FirstUndeducedParamIdx, + getDepthAndIndex(TP).first + AdjustDepth); FirstUndeducedParamIdx += 1; assert(TemplateArgsForBuildingRC[Index].isNull()); TemplateArgsForBuildingRC[Index] = @@ -919,7 +885,7 @@ Expr *buildIsDeducibleConstraint(Sema &SemaRef, NamedDecl *NewParam = transformTemplateParameter( SemaRef, AliasTemplate->getDeclContext(), TP, Args, /*NewIndex=*/TransformedTemplateArgs.size(), - getTemplateParameterDepth(TP) + AdjustDepth); + getDepthAndIndex(TP).first + AdjustDepth); TemplateArgument NewTemplateArgument = Context.getInjectedTemplateArg(NewParam); @@ -1081,8 +1047,7 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef, Args.addOuterTemplateArguments(TransformedDeducedAliasArgs); NamedDecl *NewParam = transformTemplateParameter( SemaRef, AliasTemplate->getDeclContext(), TP, Args, - /*NewIndex=*/FPrimeTemplateParams.size(), - getTemplateParameterDepth(TP)); + /*NewIndex=*/FPrimeTemplateParams.size(), getDepthAndIndex(TP).first); FPrimeTemplateParams.push_back(NewParam); TemplateArgument NewTemplateArgument = @@ -1101,7 +1066,7 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef, Args.addOuterTemplateArguments(TemplateArgsForBuildingFPrime); NamedDecl *NewParam = transformTemplateParameter( SemaRef, F->getDeclContext(), TP, Args, FPrimeTemplateParams.size(), - getTemplateParameterDepth(TP)); + getDepthAndIndex(TP).first); FPrimeTemplateParams.push_back(NewParam); assert(TemplateArgsForBuildingFPrime[FTemplateParamIdx].isNull() && diff --git a/clang/test/AST/explicit-base-class-move-cntr.cpp b/clang/test/AST/explicit-base-class-move-cntr.cpp new file mode 100644 index 00000000000000..808af2fc94336f --- /dev/null +++ b/clang/test/AST/explicit-base-class-move-cntr.cpp @@ -0,0 +1,171 @@ +// RUN: %clang_cc1 -ast-dump=json %s | FileCheck -strict-whitespace %s + +struct ExplicitBase { + explicit ExplicitBase(const char *) { } + ExplicitBase(const ExplicitBase &) {} + ExplicitBase(ExplicitBase &&) {} + ExplicitBase &operator=(const ExplicitBase &) { return *this; } + ExplicitBase &operator=(ExplicitBase &&) { return *this; } + ~ExplicitBase() { } +}; + +struct Derived1 : ExplicitBase {}; + +Derived1 makeDerived1() { +// CHECK: "kind": "FunctionDecl", +// CHECK: "name": "makeDerived1", + +// CHECK: "kind": "CompoundStmt", + +// CHECK: "kind": "ReturnStmt", +// CHECK: "kind": "ExprWithCleanups", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "Derived1" +// CHECK-NEXT: }, + +// CHECK: "kind": "CXXFunctionalCastExpr", +// CHECK: 
"type": { +// CHECK-NEXT: "qualType": "Derived1" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "prvalue", +// CHECK-NEXT: "castKind": "NoOp", + +// CHECK: "kind": "CXXBindTemporaryExpr", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "Derived1" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "prvalue", + +// CHECK: "kind": "InitListExpr", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "Derived1" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "prvalue", + +// CHECK: "kind": "CXXConstructExpr", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "ExplicitBase" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "prvalue", +// CHECK-NEXT: "ctorType": { +// CHECK-NEXT: "qualType": "void (ExplicitBase &&)" +// CHECK-NEXT: }, +// CHECK-NEXT: "hadMultipleCandidates": true, +// CHECK-NEXT: "constructionKind": "non-virtual base", + +// CHECK: "kind": "MaterializeTemporaryExpr", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "ExplicitBase" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "xvalue", +// CHECK-NEXT: "storageDuration": "full expression", + +// CHECK: "kind": "CXXBindTemporaryExpr", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "ExplicitBase" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "prvalue", + +// CHECK: "kind": "CXXTemporaryObjectExpr", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "ExplicitBase" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "prvalue", +// CHECK-NEXT: "ctorType": { +// CHECK-NEXT: "qualType": "void (const char *)" +// CHECK-NEXT: }, +// CHECK-NEXT: "list": true, +// CHECK-NEXT: "hadMultipleCandidates": true, +// CHECK-NEXT: "constructionKind": "complete", + +// CHECK: "kind": "ImplicitCastExpr", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "const char *" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "prvalue", +// CHECK-NEXT: "castKind": "ArrayToPointerDecay", + +// CHECK: "kind": "StringLiteral", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "const char[10]" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "lvalue", +// CHECK-NEXT: "value": "\"Move Ctor\"" + return Derived1{ExplicitBase{"Move Ctor"}}; +} + +struct ImplicitBase { + ImplicitBase(const char *) { } + ImplicitBase(const ImplicitBase &) {} + ImplicitBase(ImplicitBase &&) {} + ImplicitBase &operator=(const ImplicitBase &) { return *this; } + ImplicitBase &operator=(ImplicitBase &&) { return *this; } + ~ImplicitBase() { } +}; + +struct Derived2 : ImplicitBase {}; + +Derived2 makeDerived2() { +// CHECK: "kind": "FunctionDecl", +// CHECK: "name": "makeDerived2", + +// CHECK: "kind": "CompoundStmt", + +// CHECK: "kind": "ReturnStmt", + +// CHECK: "kind": "ExprWithCleanups", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "Derived2" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "prvalue", +// CHECK-NEXT: "cleanupsHaveSideEffects": true, + +// CHECK: "kind": "CXXFunctionalCastExpr", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "Derived2" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "prvalue", +// CHECK-NEXT: "castKind": "NoOp", + +// CHECK: "kind": "CXXBindTemporaryExpr", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "Derived2" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "prvalue", + +// CHECK: "kind": "InitListExpr", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "Derived2" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "prvalue", + +// CHECK: "kind": "CXXConstructExpr", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "ImplicitBase" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "prvalue", +// CHECK-NEXT: 
"ctorType": { +// CHECK-NEXT: "qualType": "void (const char *)" +// CHECK-NEXT: }, +// CHECK-NEXT: "list": true, +// CHECK-NEXT: "hadMultipleCandidates": true, +// CHECK-NEXT: "constructionKind": "non-virtual base", + +// CHECK: "kind": "ImplicitCastExpr", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "const char *" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "prvalue", +// CHECK-NEXT: "castKind": "ArrayToPointerDecay", + +// CHECK: "kind": "StringLiteral", +// CHECK: "type": { +// CHECK-NEXT: "qualType": "const char[8]" +// CHECK-NEXT: }, +// CHECK-NEXT: "valueCategory": "lvalue", +// CHECK-NEXT: "value": "\"No Ctor\"" + return Derived2{{"No Ctor"}}; +} + +// NOTE: CHECK lines have been autogenerated by gen_ast_dump_json_test.py +// using --filters=FunctionDecl,CompoundStmt,ReturnStmt,MaterializeTemporaryExpr,CXXBindTemporaryExpr,CXXTemporaryObjectExpr,ImplicitCastExpr,StringLiteralStringLiteral diff --git a/clang/test/Analysis/live-stmts.cpp b/clang/test/Analysis/live-stmts.cpp index 16954f30129f78..c60f522588e392 100644 --- a/clang/test/Analysis/live-stmts.cpp +++ b/clang/test/Analysis/live-stmts.cpp @@ -193,3 +193,112 @@ void test_lambda_refcapture() { // CHECK-NEXT: [ B2 (live expressions at block exit) ] // CHECK-EMPTY: // CHECK-EMPTY: + +int logicalOpInTernary(bool b) { + return (b || b) ? 0 : 1; +} + +// [B6 (ENTRY)] +// | +// V +// [B5 (b || ...)] +// | \ +// | | +// V V +// [B4 (b||b)] ? [B2 (0)] : [B3 (1)] +// \ / +// ---|---- +// V +// [B1] --> [B0 (EXIT)] +// return + +// CHECK: [ B0 (live expressions at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK: [ B1 (live expressions at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK: [ B2 (live expressions at block exit) ] +// CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: +// CHECK: BinaryOperator {{.*}} '_Bool' '||' +// CHECK: |-ImplicitCastExpr {{.*}} '_Bool' +// CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK: `-ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: +// CHECK: IntegerLiteral {{.*}} 'int' 0 +// CHECK-EMPTY: +// CHECK: IntegerLiteral {{.*}} 'int' 1 +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK: [ B3 (live expressions at block exit) ] +// CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: +// CHECK: BinaryOperator {{.*}} '_Bool' '||' +// CHECK: |-ImplicitCastExpr {{.*}} '_Bool' +// CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK: `-ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: +// CHECK: IntegerLiteral {{.*}} 'int' 0 +// CHECK-EMPTY: +// CHECK: IntegerLiteral {{.*}} 'int' 1 +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK: [ B4 (live expressions at block exit) ] +// CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// 
CHECK-EMPTY: +// CHECK: BinaryOperator {{.*}} '_Bool' '||' +// CHECK: |-ImplicitCastExpr {{.*}} '_Bool' +// CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK: `-ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: +// CHECK: IntegerLiteral {{.*}} 'int' 0 +// CHECK-EMPTY: +// CHECK: IntegerLiteral {{.*}} 'int' 1 +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK: [ B5 (live expressions at block exit) ] +// CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: +// CHECK: BinaryOperator {{.*}} '_Bool' '||' +// CHECK: |-ImplicitCastExpr {{.*}} '_Bool' +// CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK: `-ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: +// CHECK: IntegerLiteral {{.*}} 'int' 0 +// CHECK-EMPTY: +// CHECK: IntegerLiteral {{.*}} 'int' 1 +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK: [ B6 (live expressions at block exit) ] +// CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: +// CHECK: IntegerLiteral {{.*}} 'int' 0 +// CHECK-EMPTY: +// CHECK: IntegerLiteral {{.*}} 'int' 1 diff --git a/clang/test/Analysis/short-circuiting-eval.cpp b/clang/test/Analysis/short-circuiting-eval.cpp new file mode 100644 index 00000000000000..d0f29a849ab1ca --- /dev/null +++ b/clang/test/Analysis/short-circuiting-eval.cpp @@ -0,0 +1,39 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core.DivideZero -verify %s + +int div0LogicalOpInTernary(bool b1) { + int y = (b1 || b1) ? 0 : 1; + return 1 / y; // expected-warning {{Division by zero}} +} + +int div0LogicalAndArith(bool b1, int x) { + int y = (b1 || (x < 3)) ? 0 : 1; + return 1 / y; // expected-warning {{Division by zero}} +} + +int div0NestedLogicalOp(bool b1) { + int y = (b1 && b1 || b1 && b1) ? 0 : 1; + return 1 / y; // expected-warning {{Division by zero}} +} + +int div0TernaryInTernary(bool b) { + int y = ((b || b) ? false : true) ? 0 : 1; + return 1 / y; // expected-warning {{Division by zero}} +} + +int div0LogicalOpParensInTernary(bool b1) { + int y = ((((b1)) || ((b1)))) ? 0 : 1; + return 1 / y; // expected-warning {{Division by zero}} +} + +int div0LogicalOpInsideStExpr(bool b1) { + int y = ({1; (b1 || b1);}) ? 0 : 1; + // expected-warning@-1 {{expression result unused}} + return 1 / y; // expected-warning {{Division by zero}} +} + +int div0StExprInsideLogicalOp(bool b1) { + int y = (({1; b1;}) || ({1; b1;})) ? 
0 : 1; + // expected-warning@-1 {{expression result unused}} + // expected-warning@-2 {{expression result unused}} + return 1 / y; // expected-warning {{Division by zero}} +} diff --git a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c index 25aebeced9379c..9c3d08a25945a3 100644 --- a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c +++ b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c @@ -3,6 +3,8 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_STREAMING %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_LOCALLY %s +// REQUIRES: aarch64-registered-target + #define __ai __attribute__((always_inline)) __ai void inlined_fn(void) {} __ai void inlined_fn_streaming_compatible(void) __arm_streaming_compatible {} @@ -20,7 +22,7 @@ void caller(void) { #ifdef TEST_COMPATIBLE void caller_compatible(void) __arm_streaming_compatible { - inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_compatible' have mismatching streaming attributes}} + inlined_fn(); // expected-warning {{always_inline function 'inlined_fn' and its caller 'caller_compatible' have mismatching streaming attributes, inlining may change runtime behaviour}} inlined_fn_streaming_compatible(); inlined_fn_streaming(); // expected-error {{always_inline function 'inlined_fn_streaming' and its caller 'caller_compatible' have mismatching streaming attributes}} inlined_fn_local(); // expected-error {{always_inline function 'inlined_fn_local' and its caller 'caller_compatible' have mismatching streaming attributes}} @@ -29,7 +31,7 @@ void caller_compatible(void) __arm_streaming_compatible { #ifdef TEST_STREAMING void caller_streaming(void) __arm_streaming { - inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_streaming' have mismatching streaming attributes}} + inlined_fn(); // expected-warning {{always_inline function 'inlined_fn' and its caller 'caller_streaming' have mismatching streaming attributes, inlining may change runtime behaviour}} inlined_fn_streaming_compatible(); inlined_fn_streaming(); inlined_fn_local(); @@ -39,7 +41,7 @@ void caller_streaming(void) __arm_streaming { #ifdef TEST_LOCALLY __arm_locally_streaming void caller_local(void) { - inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_local' have mismatching streaming attributes}} + inlined_fn(); // expected-warning {{always_inline function 'inlined_fn' and its caller 'caller_local' have mismatching streaming attributes, inlining may change runtime behaviour}} inlined_fn_streaming_compatible(); inlined_fn_streaming(); inlined_fn_local(); diff --git a/clang/test/CodeGenCXX/debug-info-explicit-this.cpp b/clang/test/CodeGenCXX/debug-info-explicit-this.cpp new file mode 100644 index 00000000000000..45ab2a0216ca7a --- /dev/null +++ b/clang/test/CodeGenCXX/debug-info-explicit-this.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited -std=c++2b %s -o - | FileCheck %s + +struct Foo { + void Bar(this Foo&& self) {} +}; + +void fn() { + Foo{}.Bar(); +} + +// CHECK: distinct !DISubprogram(name: "Bar", {{.*}}, type: ![[BAR_TYPE:[0-9]+]], {{.*}}, declaration: ![[BAR_DECL:[0-9]+]], {{.*}} +// CHECK: ![[FOO:[0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Foo" +// CHECK: ![[BAR_DECL]] = !DISubprogram(name: "Bar", {{.*}}, type: 
![[BAR_TYPE]], {{.*}},
+// CHECK: ![[BAR_TYPE]] = !DISubroutineType(types: ![[PARAMS:[0-9]+]])
+// CHECK: ![[PARAMS]] = !{null, ![[SELF:[0-9]+]]}
+// CHECK: ![[SELF]] = !DIDerivedType(tag: DW_TAG_rvalue_reference_type, baseType: ![[FOO]]
diff --git a/clang/test/Driver/immediate-options.c b/clang/test/Driver/immediate-options.c
index 77878fe2e9c58e..b74f6b41f22a0d 100644
--- a/clang/test/Driver/immediate-options.c
+++ b/clang/test/Driver/immediate-options.c
@@ -2,6 +2,8 @@
 // HELP: isystem
 // HELP-NOT: ast-dump
 // HELP-NOT: driver-mode
+// HELP: -Wa,
+// HELP-NOT: -W{{[a-z][a-z]}}
 
 // Make sure that Flang-only options are not available in Clang
 // HELP-NOT: test-io
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 9b12caa37cf793..57c5f79651824f 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -9376,29 +9376,6 @@ TEST_P(ASTImporterOptionSpecificTestBase, VaListCpp) {
       ToVaList->getUnderlyingType(), ToBuiltinVaList->getUnderlyingType()));
 }
 
-TEST_P(ASTImporterOptionSpecificTestBase,
-       ImportDefinitionOfEmptyClassWithNoUniqueAddressField) {
-  Decl *FromTU = getTuDecl(
-      R"(
-      struct B {};
-      struct A { B b; };
-      )",
-      Lang_CXX20);
-
-  CXXRecordDecl *FromD = FirstDeclMatcher<CXXRecordDecl>().match(
-      FromTU, cxxRecordDecl(hasName("A")));
-
-  for (auto *FD : FromD->fields())
-    FD->addAttr(clang::NoUniqueAddressAttr::Create(FromD->getASTContext(),
-                                                   clang::SourceRange()));
-  FromD->markEmpty();
-
-  CXXRecordDecl *ToD = Import(FromD, Lang_CXX20);
-  EXPECT_TRUE(ToD->isEmpty());
-  for (auto *FD : ToD->fields())
-    EXPECT_EQ(true, FD->hasAttr<NoUniqueAddressAttr>());
-}
-
 TEST_P(ASTImporterOptionSpecificTestBase, ImportExistingTypedefToRecord) {
   const char *Code =
       R"(
diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
index 1a52b82d656656..8717d9753d161b 100644
--- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
@@ -9,6 +9,7 @@
 #include "TestingSupport.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/ExprCXX.h"
+#include "clang/AST/OperationKinds.h"
 #include "clang/AST/Type.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
@@ -79,7 +80,7 @@ class DataflowAnalysisTest : public Test {
   /// Returns the `CFGBlock` containing `S` (and asserts that it exists).
   const CFGBlock *blockForStmt(const Stmt &S) {
-    const CFGBlock *Block = ACFG->getStmtToBlock().lookup(&S);
+    const CFGBlock *Block = ACFG->blockForStmt(S);
     assert(Block != nullptr);
     return Block;
   }
 
@@ -370,6 +371,42 @@ TEST_F(DiscardExprStateTest, ConditionalOperator) {
   EXPECT_EQ(CallGState.Env.get(AddrOfI), nullptr);
 }
 
+TEST_F(DiscardExprStateTest, CallWithParenExprTreatedCorrectly) {
+  // This is a regression test.
+  // In the CFG for `target()` below, the expression that evaluates the function
+  // pointer for `expect` and the actual call are separated into different
+  // basic blocks (because of the control flow introduced by the `||`
+  // operator).
+  // The value for the `expect` function pointer was erroneously discarded
+  // from the environment between these two blocks because the code that
+  // determines whether the expression values for a block need to be preserved
+  // did not ignore the `ParenExpr` around `(i == 1)` (which is not represented
+  // in the CFG).
+ std::string Code = R"( + bool expect(bool, bool); + void target(int i) { + expect(false || (i == 1), false); + } + )"; + auto BlockStates = llvm::cantFail(runAnalysis( + Code, [](ASTContext &C) { return NoopAnalysis(C); })); + + const auto &FnToPtrDecay = matchNode( + implicitCastExpr(hasCastKind(CK_FunctionToPointerDecay))); + const auto &CallExpect = + matchNode(callExpr(callee(functionDecl(hasName("expect"))))); + + // In the block that evaluates the implicit cast of `expect` to a pointer, + // this expression is associated with a value. + const auto &FnToPtrDecayState = blockStateForStmt(BlockStates, FnToPtrDecay); + EXPECT_NE(FnToPtrDecayState.Env.getValue(FnToPtrDecay), nullptr); + + // In the block that calls `expect()`, the implicit cast of `expect` to a + // pointer is still associated with a value. + const auto &CallExpectState = blockStateForStmt(BlockStates, CallExpect); + EXPECT_NE(CallExpectState.Env.getValue(FnToPtrDecay), nullptr); +} + struct NonConvergingLattice { int State; diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt index 0632f7d5632846..937241774c1973 100644 --- a/compiler-rt/CMakeLists.txt +++ b/compiler-rt/CMakeLists.txt @@ -809,6 +809,10 @@ if(ANDROID) append_list_if(COMPILER_RT_HAS_FUSE_LD_LLD_FLAG -fuse-ld=lld SANITIZER_COMMON_LINK_FLAGS) append_list_if(COMPILER_RT_HAS_LLD -fuse-ld=lld COMPILER_RT_UNITTEST_LINK_FLAGS) endif() +if(${COMPILER_RT_DEFAULT_TARGET_ARCH} MATCHES sparc) + # lld has several bugs/limitations on SPARC, so disable (Issue #100320). + set(COMPILER_RT_HAS_LLD FALSE) +endif() pythonize_bool(COMPILER_RT_HAS_LLD) pythonize_bool(COMPILER_RT_TEST_USE_LLD) diff --git a/compiler-rt/lib/gwp_asan/definitions.h b/compiler-rt/lib/gwp_asan/definitions.h index bec02903893462..c6785d4aba6952 100644 --- a/compiler-rt/lib/gwp_asan/definitions.h +++ b/compiler-rt/lib/gwp_asan/definitions.h @@ -12,7 +12,8 @@ #define GWP_ASAN_TLS_INITIAL_EXEC \ __thread __attribute__((tls_model("initial-exec"))) -#define GWP_ASAN_UNLIKELY(X) __builtin_expect(!!(X), 0) +#define GWP_ASAN_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) +#define GWP_ASAN_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) #define GWP_ASAN_ALWAYS_INLINE inline __attribute__((always_inline)) #define GWP_ASAN_WEAK __attribute__((weak)) diff --git a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp index 3f39402652a998..5d5c729cdc8b70 100644 --- a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp +++ b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp @@ -24,13 +24,13 @@ void *GuardedPoolAllocator::map(size_t Size, const char *Name) const { assert((Size % State.PageSize) == 0); zx_handle_t Vmo; zx_status_t Status = _zx_vmo_create(Size, 0, &Vmo); - check(Status == ZX_OK, "Failed to create Vmo"); + checkWithErrorCode(Status == ZX_OK, "Failed to create Vmo", Status); _zx_object_set_property(Vmo, ZX_PROP_NAME, Name, strlen(Name)); zx_vaddr_t Addr; Status = _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE | ZX_VM_ALLOW_FAULTS, 0, Vmo, 0, Size, &Addr); - check(Status == ZX_OK, "Vmo mapping failed"); + checkWithErrorCode(Status == ZX_OK, "Vmo mapping failed", Status); _zx_handle_close(Vmo); return reinterpret_cast(Addr); } @@ -40,7 +40,7 @@ void GuardedPoolAllocator::unmap(void *Ptr, size_t Size) const { assert((Size % State.PageSize) == 0); zx_status_t Status = _zx_vmar_unmap(_zx_vmar_root_self(), 
reinterpret_cast(Ptr), Size); - check(Status == ZX_OK, "Vmo unmapping failed"); + checkWithErrorCode(Status == ZX_OK, "Vmo unmapping failed", Status); } void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) { @@ -50,7 +50,8 @@ void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) { _zx_vmar_root_self(), ZX_VM_CAN_MAP_READ | ZX_VM_CAN_MAP_WRITE | ZX_VM_CAN_MAP_SPECIFIC, 0, Size, &GuardedPagePoolPlatformData.Vmar, &Addr); - check(Status == ZX_OK, "Failed to reserve guarded pool allocator memory"); + checkWithErrorCode(Status == ZX_OK, + "Failed to reserve guarded pool allocator memory", Status); _zx_object_set_property(GuardedPagePoolPlatformData.Vmar, ZX_PROP_NAME, kGwpAsanGuardPageName, strlen(kGwpAsanGuardPageName)); return reinterpret_cast(Addr); @@ -59,8 +60,10 @@ void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) { void GuardedPoolAllocator::unreserveGuardedPool() { const zx_handle_t Vmar = GuardedPagePoolPlatformData.Vmar; assert(Vmar != ZX_HANDLE_INVALID && Vmar != _zx_vmar_root_self()); - check(_zx_vmar_destroy(Vmar) == ZX_OK, "Failed to destroy a vmar"); - check(_zx_handle_close(Vmar) == ZX_OK, "Failed to close a vmar"); + zx_status_t Status = _zx_vmar_destroy(Vmar); + checkWithErrorCode(Status == ZX_OK, "Failed to destroy a vmar", Status); + Status = _zx_handle_close(Vmar); + checkWithErrorCode(Status == ZX_OK, "Failed to close a vmar", Status); GuardedPagePoolPlatformData.Vmar = ZX_HANDLE_INVALID; } @@ -69,7 +72,7 @@ void GuardedPoolAllocator::allocateInGuardedPool(void *Ptr, size_t Size) const { assert((Size % State.PageSize) == 0); zx_handle_t Vmo; zx_status_t Status = _zx_vmo_create(Size, 0, &Vmo); - check(Status == ZX_OK, "Failed to create vmo"); + checkWithErrorCode(Status == ZX_OK, "Failed to create vmo", Status); _zx_object_set_property(Vmo, ZX_PROP_NAME, kGwpAsanAliveSlotName, strlen(kGwpAsanAliveSlotName)); const zx_handle_t Vmar = GuardedPagePoolPlatformData.Vmar; @@ -81,7 +84,7 @@ void GuardedPoolAllocator::allocateInGuardedPool(void *Ptr, size_t Size) const { ZX_VM_PERM_READ | ZX_VM_PERM_WRITE | ZX_VM_ALLOW_FAULTS | ZX_VM_SPECIFIC, Offset, Vmo, 0, Size, &P); - check(Status == ZX_OK, "Vmo mapping failed"); + checkWithErrorCode(Status == ZX_OK, "Vmo mapping failed", Status); _zx_handle_close(Vmo); } @@ -93,7 +96,7 @@ void GuardedPoolAllocator::deallocateInGuardedPool(void *Ptr, assert(Vmar != ZX_HANDLE_INVALID && Vmar != _zx_vmar_root_self()); const zx_status_t Status = _zx_vmar_unmap(Vmar, reinterpret_cast(Ptr), Size); - check(Status == ZX_OK, "Vmar unmapping failed"); + checkWithErrorCode(Status == ZX_OK, "Vmar unmapping failed", Status); } size_t GuardedPoolAllocator::getPlatformPageSize() { diff --git a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp index 549e31acb176d3..7b2e19956c7962 100644 --- a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp +++ b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp @@ -12,6 +12,7 @@ #include "gwp_asan/utilities.h" #include +#include #include #include #include @@ -46,7 +47,8 @@ void *GuardedPoolAllocator::map(size_t Size, const char *Name) const { assert((Size % State.PageSize) == 0); void *Ptr = mmap(nullptr, Size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - check(Ptr != MAP_FAILED, "Failed to map guarded pool allocator memory"); + checkWithErrorCode(Ptr != MAP_FAILED, + "Failed to map guarded pool allocator memory", errno); 
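An aside on the pattern being rolled out across these platform files: `check` keeps its message-only form, while call sites that have an OS status to report switch to `checkWithErrorCode`. A self-contained sketch of the idea, with a simplified stand-in for the patch's platform-specific `dieWithErrorCode` (assumption on my part: the real implementations report through the sanitizer log or `android_set_abort_message`, not `stderr`):

```cpp
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Simplified stand-in for the platform-specific dieWithErrorCode().
[[noreturn]] void dieWithErrorCode(const char *Message, int64_t ErrorCode) {
  std::fprintf(stderr, "%s (Error Code: %" PRId64 ")\n", Message, ErrorCode);
  std::abort();
}

// Same shape as the new gwp_asan::checkWithErrorCode(): the happy path is
// marked likely, mirroring GWP_ASAN_LIKELY's __builtin_expect wrapper.
inline void checkWithErrorCode(bool Condition, const char *Message,
                               int64_t ErrorCode) {
  if (__builtin_expect(static_cast<bool>(Condition), true))
    return;
  dieWithErrorCode(Message, ErrorCode);
}

int main() {
  checkWithErrorCode(true, "never printed", 0);
  // checkWithErrorCode(false, "Vmo mapping failed", 7);  // would abort
}
```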
MaybeSetMappingName(Ptr, Size, Name); return Ptr; } @@ -54,15 +56,16 @@ void *GuardedPoolAllocator::map(size_t Size, const char *Name) const { void GuardedPoolAllocator::unmap(void *Ptr, size_t Size) const { assert((reinterpret_cast(Ptr) % State.PageSize) == 0); assert((Size % State.PageSize) == 0); - check(munmap(Ptr, Size) == 0, - "Failed to unmap guarded pool allocator memory."); + checkWithErrorCode(munmap(Ptr, Size) == 0, + "Failed to unmap guarded pool allocator memory.", errno); } void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) { assert((Size % State.PageSize) == 0); void *Ptr = mmap(nullptr, Size, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - check(Ptr != MAP_FAILED, "Failed to reserve guarded pool allocator memory"); + checkWithErrorCode(Ptr != MAP_FAILED, + "Failed to reserve guarded pool allocator memory", errno); MaybeSetMappingName(Ptr, Size, kGwpAsanGuardPageName); return Ptr; } @@ -75,8 +78,9 @@ void GuardedPoolAllocator::unreserveGuardedPool() { void GuardedPoolAllocator::allocateInGuardedPool(void *Ptr, size_t Size) const { assert((reinterpret_cast(Ptr) % State.PageSize) == 0); assert((Size % State.PageSize) == 0); - check(mprotect(Ptr, Size, PROT_READ | PROT_WRITE) == 0, - "Failed to allocate in guarded pool allocator memory"); + checkWithErrorCode(mprotect(Ptr, Size, PROT_READ | PROT_WRITE) == 0, + "Failed to allocate in guarded pool allocator memory", + errno); MaybeSetMappingName(Ptr, Size, kGwpAsanAliveSlotName); } @@ -87,9 +91,10 @@ void GuardedPoolAllocator::deallocateInGuardedPool(void *Ptr, // mmap() a PROT_NONE page over the address to release it to the system, if // we used mprotect() here the system would count pages in the quarantine // against the RSS. - check(mmap(Ptr, Size, PROT_NONE, MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1, - 0) != MAP_FAILED, - "Failed to deallocate in guarded pool allocator memory"); + checkWithErrorCode( + mmap(Ptr, Size, PROT_NONE, MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1, + 0) != MAP_FAILED, + "Failed to deallocate in guarded pool allocator memory", errno); MaybeSetMappingName(Ptr, Size, kGwpAsanGuardPageName); } diff --git a/compiler-rt/lib/gwp_asan/platform_specific/utilities_fuchsia.cpp b/compiler-rt/lib/gwp_asan/platform_specific/utilities_fuchsia.cpp index bc9d3a4462a2f7..fecf94b0d1a196 100644 --- a/compiler-rt/lib/gwp_asan/platform_specific/utilities_fuchsia.cpp +++ b/compiler-rt/lib/gwp_asan/platform_specific/utilities_fuchsia.cpp @@ -8,12 +8,25 @@ #include "gwp_asan/utilities.h" +#include +#include #include #include +#include namespace gwp_asan { void die(const char *Message) { __sanitizer_log_write(Message, strlen(Message)); __builtin_trap(); } + +void dieWithErrorCode(const char *Message, int64_t ErrorCode) { + const char *error_str = + _zx_status_get_string(static_cast(ErrorCode)); + size_t buffer_size = strlen(Message) + 32 + strlen(error_str); + char *buffer = static_cast(alloca(buffer_size)); + snprintf(buffer, buffer_size, "%s (Error Code: %s)", Message, error_str); + __sanitizer_log_write(buffer, strlen(buffer)); + __builtin_trap(); +} } // namespace gwp_asan diff --git a/compiler-rt/lib/gwp_asan/platform_specific/utilities_posix.cpp b/compiler-rt/lib/gwp_asan/platform_specific/utilities_posix.cpp index 73579630550956..750198062ce4f5 100644 --- a/compiler-rt/lib/gwp_asan/platform_specific/utilities_posix.cpp +++ b/compiler-rt/lib/gwp_asan/platform_specific/utilities_posix.cpp @@ -6,7 +6,11 @@ // //===----------------------------------------------------------------------===// +#include #include // 
IWYU pragma: keep (for __BIONIC__ macro) +#include +#include +#include #ifdef __BIONIC__ #include "gwp_asan/definitions.h" @@ -27,4 +31,21 @@ void die(const char *Message) { __builtin_trap(); #endif // __BIONIC__ } + +void dieWithErrorCode(const char *Message, int64_t ErrorCode) { +#ifdef __BIONIC__ + if (&android_set_abort_message == nullptr) + abort(); + + size_t buffer_size = strlen(Message) + 48; + char *buffer = static_cast(alloca(buffer_size)); + snprintf(buffer, buffer_size, "%s (Error Code: %" PRId64 ")", Message, + ErrorCode); + android_set_abort_message(buffer); + abort(); +#else // __BIONIC__ + fprintf(stderr, "%s (Error Code: %" PRId64 ")", Message, ErrorCode); + __builtin_trap(); +#endif // __BIONIC__ +} } // namespace gwp_asan diff --git a/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt b/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt index ca43ec2a94ac4d..5de1af10eec366 100644 --- a/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt +++ b/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt @@ -28,7 +28,9 @@ set(GWP_ASAN_UNITTESTS late_init.cpp options.cpp recoverable.cpp - never_allocated.cpp) + never_allocated.cpp + utilities.cpp +) set(GWP_ASAN_UNIT_TEST_HEADERS ${GWP_ASAN_HEADERS} diff --git a/compiler-rt/lib/gwp_asan/tests/utilities.cpp b/compiler-rt/lib/gwp_asan/tests/utilities.cpp new file mode 100644 index 00000000000000..09a54e526f5dbc --- /dev/null +++ b/compiler-rt/lib/gwp_asan/tests/utilities.cpp @@ -0,0 +1,24 @@ +//===-- utilities.cpp -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "gwp_asan/utilities.h" +#include "gwp_asan/tests/harness.h" + +using gwp_asan::check; +using gwp_asan::checkWithErrorCode; + +TEST(UtilitiesDeathTest, CheckPrintsAsExpected) { + EXPECT_DEATH({ check(false, "Hello world"); }, "Hello world"); + check(true, "Should not crash"); + EXPECT_DEATH( + { checkWithErrorCode(false, "Hello world", 1337); }, + "Hello world \\(Error Code: 1337\\)"); + EXPECT_DEATH( + { checkWithErrorCode(false, "Hello world", -1337); }, + "Hello world \\(Error Code: -1337\\)"); +} diff --git a/compiler-rt/lib/gwp_asan/utilities.h b/compiler-rt/lib/gwp_asan/utilities.h index 76e5df2e3eb8f8..02f450a9eeec3d 100644 --- a/compiler-rt/lib/gwp_asan/utilities.h +++ b/compiler-rt/lib/gwp_asan/utilities.h @@ -12,17 +12,28 @@ #include "gwp_asan/definitions.h" #include +#include namespace gwp_asan { // Terminates in a platform-specific way with `Message`. void die(const char *Message); +void dieWithErrorCode(const char *Message, int64_t ErrorCode); // Checks that `Condition` is true, otherwise dies with `Message`. GWP_ASAN_ALWAYS_INLINE void check(bool Condition, const char *Message) { - if (Condition) + if (GWP_ASAN_LIKELY(Condition)) return; die(Message); } + +// Checks that `Condition` is true, otherwise dies with `Message` (including +// errno at the end). 
+GWP_ASAN_ALWAYS_INLINE void +checkWithErrorCode(bool Condition, const char *Message, int64_t ErrorCode) { + if (GWP_ASAN_LIKELY(Condition)) + return; + dieWithErrorCode(Message, ErrorCode); +} } // namespace gwp_asan #endif // GWP_ASAN_UTILITIES_H_ diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index b26c1679086b95..310c0b0b5fb636 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -181,19 +181,19 @@ static void addUseDeviceClause( static void convertLoopBounds(lower::AbstractConverter &converter, mlir::Location loc, - mlir::omp::CollapseClauseOps &result, + mlir::omp::LoopRelatedOps &result, std::size_t loopVarTypeSize) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); // The types of lower bound, upper bound, and step are converted into the // type of the loop variable if necessary. mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize); - for (unsigned it = 0; it < (unsigned)result.loopLBVar.size(); it++) { - result.loopLBVar[it] = - firOpBuilder.createConvert(loc, loopVarType, result.loopLBVar[it]); - result.loopUBVar[it] = - firOpBuilder.createConvert(loc, loopVarType, result.loopUBVar[it]); - result.loopStepVar[it] = - firOpBuilder.createConvert(loc, loopVarType, result.loopStepVar[it]); + for (unsigned it = 0; it < (unsigned)result.loopLowerBounds.size(); it++) { + result.loopLowerBounds[it] = firOpBuilder.createConvert( + loc, loopVarType, result.loopLowerBounds[it]); + result.loopUpperBounds[it] = firOpBuilder.createConvert( + loc, loopVarType, result.loopUpperBounds[it]); + result.loopSteps[it] = + firOpBuilder.createConvert(loc, loopVarType, result.loopSteps[it]); } } @@ -203,7 +203,7 @@ static void convertLoopBounds(lower::AbstractConverter &converter, bool ClauseProcessor::processCollapse( mlir::Location currentLocation, lower::pft::Evaluation &eval, - mlir::omp::CollapseClauseOps &result, + mlir::omp::LoopRelatedOps &result, llvm::SmallVectorImpl &iv) const { bool found = false; fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); @@ -232,15 +232,15 @@ bool ClauseProcessor::processCollapse( std::get_if(&loopControl->u); assert(bounds && "Expected bounds for worksharing do loop"); lower::StatementContext stmtCtx; - result.loopLBVar.push_back(fir::getBase( + result.loopLowerBounds.push_back(fir::getBase( converter.genExprValue(*semantics::GetExpr(bounds->lower), stmtCtx))); - result.loopUBVar.push_back(fir::getBase( + result.loopUpperBounds.push_back(fir::getBase( converter.genExprValue(*semantics::GetExpr(bounds->upper), stmtCtx))); if (bounds->step) { - result.loopStepVar.push_back(fir::getBase( + result.loopSteps.push_back(fir::getBase( converter.genExprValue(*semantics::GetExpr(bounds->step), stmtCtx))); } else { // If `step` is not present, assume it as `1`. 
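// For example (an illustrative sketch, not part of this change): lowering
//   !$omp do
//   do i = 1, n
// finds no step in the loop control, so a 32-bit constant 1 is materialized
// here; convertLoopBounds() above then converts it, together with the bounds,
// to the loop-variable type.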
- result.loopStepVar.push_back(firOpBuilder.createIntegerConstant( + result.loopSteps.push_back(firOpBuilder.createIntegerConstant( currentLocation, firOpBuilder.getIntegerType(32), 1)); } iv.push_back(bounds->name.thing.symbol); @@ -291,8 +291,7 @@ bool ClauseProcessor::processDevice(lower::StatementContext &stmtCtx, } } const auto &deviceExpr = std::get(clause->t); - result.deviceVar = - fir::getBase(converter.genExprValue(deviceExpr, stmtCtx)); + result.device = fir::getBase(converter.genExprValue(deviceExpr, stmtCtx)); return true; } return false; @@ -322,10 +321,10 @@ bool ClauseProcessor::processDistSchedule( lower::StatementContext &stmtCtx, mlir::omp::DistScheduleClauseOps &result) const { if (auto *clause = findUniqueClause()) { - result.distScheduleStaticAttr = converter.getFirOpBuilder().getUnitAttr(); + result.distScheduleStatic = converter.getFirOpBuilder().getUnitAttr(); const auto &chunkSize = std::get>(clause->t); if (chunkSize) - result.distScheduleChunkSizeVar = + result.distScheduleChunkSize = fir::getBase(converter.genExprValue(*chunkSize, stmtCtx)); return true; } @@ -335,7 +334,7 @@ bool ClauseProcessor::processDistSchedule( bool ClauseProcessor::processFilter(lower::StatementContext &stmtCtx, mlir::omp::FilterClauseOps &result) const { if (auto *clause = findUniqueClause()) { - result.filteredThreadIdVar = + result.filteredThreadId = fir::getBase(converter.genExprValue(clause->v, stmtCtx)); return true; } @@ -351,7 +350,7 @@ bool ClauseProcessor::processFinal(lower::StatementContext &stmtCtx, mlir::Value finalVal = fir::getBase(converter.genExprValue(clause->v, stmtCtx)); - result.finalVar = firOpBuilder.createConvert( + result.final = firOpBuilder.createConvert( clauseLocation, firOpBuilder.getI1Type(), finalVal); return true; } @@ -362,7 +361,7 @@ bool ClauseProcessor::processHint(mlir::omp::HintClauseOps &result) const { if (auto *clause = findUniqueClause()) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); int64_t hintValue = *evaluate::ToInt64(clause->v); - result.hintAttr = firOpBuilder.getI64IntegerAttr(hintValue); + result.hint = firOpBuilder.getI64IntegerAttr(hintValue); return true; } return false; @@ -370,11 +369,11 @@ bool ClauseProcessor::processHint(mlir::omp::HintClauseOps &result) const { bool ClauseProcessor::processMergeable( mlir::omp::MergeableClauseOps &result) const { - return markClauseOccurrence(result.mergeableAttr); + return markClauseOccurrence(result.mergeable); } bool ClauseProcessor::processNowait(mlir::omp::NowaitClauseOps &result) const { - return markClauseOccurrence(result.nowaitAttr); + return markClauseOccurrence(result.nowait); } bool ClauseProcessor::processNumTeams( @@ -385,7 +384,7 @@ bool ClauseProcessor::processNumTeams( if (auto *clause = findUniqueClause()) { // auto lowerBound = std::get>(clause->t); auto &upperBound = std::get(clause->t); - result.numTeamsUpperVar = + result.numTeamsUpper = fir::getBase(converter.genExprValue(upperBound, stmtCtx)); return true; } @@ -397,7 +396,7 @@ bool ClauseProcessor::processNumThreads( mlir::omp::NumThreadsClauseOps &result) const { if (auto *clause = findUniqueClause()) { // OMPIRBuilder expects `NUM_THREADS` clause as a `Value`. 
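// In other words (an illustrative sketch): `num_threads(4)` is not kept as an
// attribute; the expression is evaluated to an SSA value, roughly
//   %c4 = arith.constant 4 : i32
//   omp.parallel num_threads(%c4 : i32) { ... }
// so compile-time and runtime-dependent thread counts take the same path.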
- result.numThreadsVar = + result.numThreads = fir::getBase(converter.genExprValue(clause->v, stmtCtx)); return true; } @@ -408,17 +407,17 @@ bool ClauseProcessor::processOrder(mlir::omp::OrderClauseOps &result) const { using Order = omp::clause::Order; if (auto *clause = findUniqueClause()) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - result.orderAttr = mlir::omp::ClauseOrderKindAttr::get( + result.order = mlir::omp::ClauseOrderKindAttr::get( firOpBuilder.getContext(), mlir::omp::ClauseOrderKind::Concurrent); const auto &modifier = std::get>(clause->t); if (modifier && *modifier == Order::OrderModifier::Unconstrained) { - result.orderModAttr = mlir::omp::OrderModifierAttr::get( + result.orderMod = mlir::omp::OrderModifierAttr::get( firOpBuilder.getContext(), mlir::omp::OrderModifier::unconstrained); } else { // "If order-modifier is not unconstrained, the behavior is as if the // reproducible modifier is present." - result.orderModAttr = mlir::omp::OrderModifierAttr::get( + result.orderMod = mlir::omp::OrderModifierAttr::get( firOpBuilder.getContext(), mlir::omp::OrderModifier::reproducible); } return true; @@ -433,7 +432,7 @@ bool ClauseProcessor::processOrdered( int64_t orderedClauseValue = 0l; if (clause->v.has_value()) orderedClauseValue = *evaluate::ToInt64(*clause->v); - result.orderedAttr = firOpBuilder.getI64IntegerAttr(orderedClauseValue); + result.ordered = firOpBuilder.getI64IntegerAttr(orderedClauseValue); return true; } return false; @@ -443,8 +442,7 @@ bool ClauseProcessor::processPriority( lower::StatementContext &stmtCtx, mlir::omp::PriorityClauseOps &result) const { if (auto *clause = findUniqueClause()) { - result.priorityVar = - fir::getBase(converter.genExprValue(clause->v, stmtCtx)); + result.priority = fir::getBase(converter.genExprValue(clause->v, stmtCtx)); return true; } return false; @@ -454,7 +452,7 @@ bool ClauseProcessor::processProcBind( mlir::omp::ProcBindClauseOps &result) const { if (auto *clause = findUniqueClause()) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - result.procBindKindAttr = genProcBindKindAttr(firOpBuilder, *clause); + result.procBindKind = genProcBindKindAttr(firOpBuilder, *clause); return true; } return false; @@ -465,7 +463,7 @@ bool ClauseProcessor::processSafelen( if (auto *clause = findUniqueClause()) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); const std::optional safelenVal = evaluate::ToInt64(clause->v); - result.safelenAttr = firOpBuilder.getI64IntegerAttr(*safelenVal); + result.safelen = firOpBuilder.getI64IntegerAttr(*safelenVal); return true; } return false; @@ -498,19 +496,19 @@ bool ClauseProcessor::processSchedule( break; } - result.scheduleValAttr = + result.scheduleKind = mlir::omp::ClauseScheduleKindAttr::get(context, scheduleKind); - mlir::omp::ScheduleModifier scheduleModifier = getScheduleModifier(*clause); - if (scheduleModifier != mlir::omp::ScheduleModifier::none) - result.scheduleModAttr = - mlir::omp::ScheduleModifierAttr::get(context, scheduleModifier); + mlir::omp::ScheduleModifier scheduleMod = getScheduleModifier(*clause); + if (scheduleMod != mlir::omp::ScheduleModifier::none) + result.scheduleMod = + mlir::omp::ScheduleModifierAttr::get(context, scheduleMod); if (getSimdModifier(*clause) != mlir::omp::ScheduleModifier::none) - result.scheduleSimdAttr = firOpBuilder.getUnitAttr(); + result.scheduleSimd = firOpBuilder.getUnitAttr(); if (const auto &chunkExpr = std::get(clause->t)) - result.scheduleChunkVar = + result.scheduleChunk = 
fir::getBase(converter.genExprValue(*chunkExpr, stmtCtx)); return true; @@ -523,7 +521,7 @@ bool ClauseProcessor::processSimdlen( if (auto *clause = findUniqueClause()) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); const std::optional simdlenVal = evaluate::ToInt64(clause->v); - result.simdlenAttr = firOpBuilder.getI64IntegerAttr(*simdlenVal); + result.simdlen = firOpBuilder.getI64IntegerAttr(*simdlenVal); return true; } return false; @@ -533,7 +531,7 @@ bool ClauseProcessor::processThreadLimit( lower::StatementContext &stmtCtx, mlir::omp::ThreadLimitClauseOps &result) const { if (auto *clause = findUniqueClause()) { - result.threadLimitVar = + result.threadLimit = fir::getBase(converter.genExprValue(clause->v, stmtCtx)); return true; } @@ -541,7 +539,7 @@ bool ClauseProcessor::processThreadLimit( } bool ClauseProcessor::processUntied(mlir::omp::UntiedClauseOps &result) const { - return markClauseOccurrence(result.untiedAttr); + return markClauseOccurrence(result.untied); } //===----------------------------------------------------------------------===// @@ -565,7 +563,7 @@ static void addAlignedClause(lower::AbstractConverter &converter, const omp::clause::Aligned &clause, llvm::SmallVectorImpl &alignedVars, - llvm::SmallVectorImpl &alignmentAttrs) { + llvm::SmallVectorImpl &alignments) { using Aligned = omp::clause::Aligned; lower::StatementContext stmtCtx; mlir::IntegerAttr alignmentValueAttr; @@ -594,7 +592,7 @@ addAlignedClause(lower::AbstractConverter &converter, alignmentValueAttr = builder.getI64IntegerAttr(alignment); // All the list items in a aligned clause will have same alignment for (std::size_t i = 0; i < objects.size(); i++) - alignmentAttrs.push_back(alignmentValueAttr); + alignments.push_back(alignmentValueAttr); } } @@ -603,7 +601,7 @@ bool ClauseProcessor::processAligned( return findRepeatableClause( [&](const omp::clause::Aligned &clause, const parser::CharBlock &) { addAlignedClause(converter, clause, result.alignedVars, - result.alignmentAttrs); + result.alignments); }); } @@ -798,7 +796,7 @@ bool ClauseProcessor::processCopyprivate( result.copyprivateVars.push_back(cpVar); mlir::func::FuncOp funcOp = createCopyFunc(currentLocation, converter, cpVar.getType(), attrs); - result.copyprivateFuncs.push_back(mlir::SymbolRefAttr::get(funcOp)); + result.copyprivateSyms.push_back(mlir::SymbolRefAttr::get(funcOp)); }; bool hasCopyPrivate = findRepeatableClause( @@ -832,7 +830,7 @@ bool ClauseProcessor::processDepend(mlir::omp::DependClauseOps &result) const { mlir::omp::ClauseTaskDependAttr dependTypeOperand = genDependKindAttr(firOpBuilder, kind); - result.dependTypeAttrs.append(objects.size(), dependTypeOperand); + result.dependKinds.append(objects.size(), dependTypeOperand); for (const omp::Object &object : objects) { assert(object.ref() && "Expecting designator"); @@ -1037,10 +1035,9 @@ bool ClauseProcessor::processReduction( // Copy local lists into the output. 
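// (An illustrative note: the three output lists are kept index-aligned, so
// for each i, reductionVars[i] is the reduced variable, reductionByref[i]
// records whether it is lowered by reference, and reductionSyms[i] refers to
// the matching reduction declaration.)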
llvm::copy(reductionVars, std::back_inserter(result.reductionVars)); - llvm::copy(reduceVarByRef, - std::back_inserter(result.reductionVarsByRef)); + llvm::copy(reduceVarByRef, std::back_inserter(result.reductionByref)); llvm::copy(reductionDeclSymbols, - std::back_inserter(result.reductionDeclSymbols)); + std::back_inserter(result.reductionSyms)); if (outReductionTypes) { outReductionTypes->reserve(outReductionTypes->size() + diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index 4340c3278cebc9..2c4b3857fda9f3 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -55,7 +55,7 @@ class ClauseProcessor { // 'Unique' clauses: They can appear at most once in the clause list. bool processCollapse(mlir::Location currentLocation, lower::pft::Evaluation &eval, - mlir::omp::CollapseClauseOps &result, + mlir::omp::LoopRelatedOps &result, llvm::SmallVectorImpl &iv) const; bool processDefault() const; bool processDevice(lower::StatementContext &stmtCtx, diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index a340b62eb7b66f..78f83ede570a62 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -273,8 +273,9 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { mlir::Value cmpOp; llvm::SmallVector vs; vs.reserve(loopOp.getIVs().size()); - for (auto [iv, ub, step] : llvm::zip_equal( - loopOp.getIVs(), loopOp.getUpperBound(), loopOp.getStep())) { + for (auto [iv, ub, step] : + llvm::zip_equal(loopOp.getIVs(), loopOp.getLoopUpperBounds(), + loopOp.getLoopSteps())) { // v = iv + step // cmp = step < 0 ? v < ub : v > ub mlir::Value v = firOpBuilder.create(loc, iv, step); @@ -593,7 +594,7 @@ void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym, }(); if (clauseOps) { - clauseOps->privatizers.push_back(mlir::SymbolRefAttr::get(privatizerOp)); + clauseOps->privateSyms.push_back(mlir::SymbolRefAttr::get(privatizerOp)); clauseOps->privateVars.push_back(hsb.getAddr()); } diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 1f280a02a6550a..2b1839b5270d4f 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -284,7 +284,7 @@ static void getDeclareTargetInfo( lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPDeclareTargetConstruct &declareTargetConstruct, - mlir::omp::DeclareTargetClauseOps &clauseOps, + mlir::omp::DeclareTargetOperands &clauseOps, llvm::SmallVectorImpl &symbolAndClause) { const auto &spec = std::get(declareTargetConstruct.t); @@ -322,7 +322,7 @@ static void collectDeferredDeclareTargets( const parser::OpenMPDeclareTargetConstruct &declareTargetConstruct, llvm::SmallVectorImpl &deferredDeclareTarget) { - mlir::omp::DeclareTargetClauseOps clauseOps; + mlir::omp::DeclareTargetOperands clauseOps; llvm::SmallVector symbolAndClause; getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, clauseOps, symbolAndClause); @@ -347,7 +347,7 @@ getDeclareTargetFunctionDevice( lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPDeclareTargetConstruct &declareTargetConstruct) { - mlir::omp::DeclareTargetClauseOps clauseOps; + mlir::omp::DeclareTargetOperands clauseOps; llvm::SmallVector symbolAndClause; 
getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, clauseOps, symbolAndClause); @@ -929,7 +929,7 @@ genBodyOfTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, std::underlying_type_t>( llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT), mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType()); - targetOp.getMapOperandsMutable().append(mapOp); + targetOp.getMapVarsMutable().append(mapOp); mlir::Value clonedValArg = region.addArgument(copyVal.getType(), copyVal.getLoc()); firOpBuilder.setInsertionPointToStart(regionBlock); @@ -1022,15 +1022,13 @@ static OpTy genWrapperOp(lower::AbstractConverter &converter, // Code generation functions for clauses //===----------------------------------------------------------------------===// -static void genCriticalDeclareClauses(lower::AbstractConverter &converter, - semantics::SemanticsContext &semaCtx, - const List &clauses, - mlir::Location loc, - mlir::omp::CriticalClauseOps &clauseOps, - llvm::StringRef name) { +static void genCriticalDeclareClauses( + lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, + const List &clauses, mlir::Location loc, + mlir::omp::CriticalDeclareOperands &clauseOps, llvm::StringRef name) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processHint(clauseOps); - clauseOps.criticalNameAttr = + clauseOps.symName = mlir::StringAttr::get(converter.getFirOpBuilder().getContext(), name); } @@ -1039,7 +1037,7 @@ static void genDistributeClauses(lower::AbstractConverter &converter, lower::StatementContext &stmtCtx, const List &clauses, mlir::Location loc, - mlir::omp::DistributeClauseOps &clauseOps) { + mlir::omp::DistributeOperands &clauseOps) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); cp.processDistSchedule(stmtCtx, clauseOps); @@ -1063,18 +1061,18 @@ static void genLoopNestClauses(lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const List &clauses, - mlir::Location loc, mlir::omp::LoopNestClauseOps &clauseOps, + mlir::Location loc, mlir::omp::LoopNestOperands &clauseOps, llvm::SmallVectorImpl &iv) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processCollapse(loc, eval, clauseOps, iv); - clauseOps.loopInclusiveAttr = converter.getFirOpBuilder().getUnitAttr(); + clauseOps.loopInclusive = converter.getFirOpBuilder().getUnitAttr(); } static void genMaskedClauses(lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, lower::StatementContext &stmtCtx, const List &clauses, mlir::Location loc, - mlir::omp::MaskedClauseOps &clauseOps) { + mlir::omp::MaskedOperands &clauseOps) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processFilter(stmtCtx, clauseOps); } @@ -1083,7 +1081,7 @@ static void genOrderedRegionClauses(lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, const List &clauses, mlir::Location loc, - mlir::omp::OrderedRegionClauseOps &clauseOps) { + mlir::omp::OrderedRegionOperands &clauseOps) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processTODO(loc, llvm::omp::Directive::OMPD_ordered); } @@ -1091,7 +1089,7 @@ genOrderedRegionClauses(lower::AbstractConverter &converter, static void genParallelClauses( lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, lower::StatementContext &stmtCtx, const List &clauses, - mlir::Location loc, mlir::omp::ParallelClauseOps &clauseOps, + mlir::Location loc, mlir::omp::ParallelOperands &clauseOps, llvm::SmallVectorImpl &reductionTypes, 
llvm::SmallVectorImpl &reductionSyms) { ClauseProcessor cp(converter, semaCtx, clauses); @@ -1106,7 +1104,7 @@ static void genParallelClauses( static void genSectionsClauses( lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, const List &clauses, mlir::Location loc, - mlir::omp::SectionsClauseOps &clauseOps, + mlir::omp::SectionsOperands &clauseOps, llvm::SmallVectorImpl &reductionTypes, llvm::SmallVectorImpl &reductionSyms) { ClauseProcessor cp(converter, semaCtx, clauses); @@ -1119,7 +1117,7 @@ static void genSectionsClauses( static void genSimdClauses(lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, const List &clauses, mlir::Location loc, - mlir::omp::SimdClauseOps &clauseOps) { + mlir::omp::SimdOperands &clauseOps) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processAligned(clauseOps); cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps); @@ -1136,7 +1134,7 @@ static void genSimdClauses(lower::AbstractConverter &converter, static void genSingleClauses(lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, const List &clauses, mlir::Location loc, - mlir::omp::SingleClauseOps &clauseOps) { + mlir::omp::SingleOperands &clauseOps) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); cp.processCopyprivate(loc, clauseOps); @@ -1148,7 +1146,7 @@ static void genTargetClauses( lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, lower::StatementContext &stmtCtx, const List &clauses, mlir::Location loc, bool processHostOnlyClauses, - mlir::omp::TargetClauseOps &clauseOps, + mlir::omp::TargetOperands &clauseOps, llvm::SmallVectorImpl &mapSyms, llvm::SmallVectorImpl &mapLocs, llvm::SmallVectorImpl &mapTypes, @@ -1185,7 +1183,7 @@ static void genTargetClauses( static void genTargetDataClauses( lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, lower::StatementContext &stmtCtx, const List &clauses, - mlir::Location loc, mlir::omp::TargetDataClauseOps &clauseOps, + mlir::Location loc, mlir::omp::TargetDataOperands &clauseOps, llvm::SmallVectorImpl &useDeviceTypes, llvm::SmallVectorImpl &useDeviceLocs, llvm::SmallVectorImpl &useDeviceSyms) { @@ -1218,7 +1216,7 @@ static void genTargetEnterExitUpdateDataClauses( lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, lower::StatementContext &stmtCtx, const List &clauses, mlir::Location loc, llvm::omp::Directive directive, - mlir::omp::TargetEnterExitUpdateDataClauseOps &clauseOps) { + mlir::omp::TargetEnterExitUpdateDataOperands &clauseOps) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processDepend(clauseOps); cp.processDevice(stmtCtx, clauseOps); @@ -1237,7 +1235,7 @@ static void genTaskClauses(lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, lower::StatementContext &stmtCtx, const List &clauses, mlir::Location loc, - mlir::omp::TaskClauseOps &clauseOps) { + mlir::omp::TaskOperands &clauseOps) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); cp.processDefault(); @@ -1256,7 +1254,7 @@ static void genTaskClauses(lower::AbstractConverter &converter, static void genTaskgroupClauses(lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, const List &clauses, mlir::Location loc, - mlir::omp::TaskgroupClauseOps &clauseOps) { + mlir::omp::TaskgroupOperands &clauseOps) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); cp.processTODO(loc, @@ -1266,7 +1264,7 @@ static 
void genTaskgroupClauses(lower::AbstractConverter &converter, static void genTaskwaitClauses(lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, const List &clauses, mlir::Location loc, - mlir::omp::TaskwaitClauseOps &clauseOps) { + mlir::omp::TaskwaitOperands &clauseOps) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processTODO( loc, llvm::omp::Directive::OMPD_taskwait); @@ -1276,7 +1274,7 @@ static void genTeamsClauses(lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, lower::StatementContext &stmtCtx, const List &clauses, mlir::Location loc, - mlir::omp::TeamsClauseOps &clauseOps) { + mlir::omp::TeamsOperands &clauseOps) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); cp.processDefault(); @@ -1291,7 +1289,7 @@ static void genTeamsClauses(lower::AbstractConverter &converter, static void genWsloopClauses( lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, lower::StatementContext &stmtCtx, const List &clauses, - mlir::Location loc, mlir::omp::WsloopClauseOps &clauseOps, + mlir::Location loc, mlir::omp::WsloopOperands &clauseOps, llvm::SmallVectorImpl &reductionTypes, llvm::SmallVectorImpl &reductionSyms) { ClauseProcessor cp(converter, semaCtx, clauses); @@ -1332,7 +1330,7 @@ genCriticalOp(lower::AbstractConverter &converter, lower::SymMap &symTable, mlir::ModuleOp mod = firOpBuilder.getModule(); auto global = mod.lookupSymbol(nameStr); if (!global) { - mlir::omp::CriticalClauseOps clauseOps; + mlir::omp::CriticalDeclareOperands clauseOps; genCriticalDeclareClauses(converter, semaCtx, item->clauses, loc, clauseOps, nameStr); @@ -1367,7 +1365,7 @@ genLoopNestOp(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item, - mlir::omp::LoopNestClauseOps &clauseOps, + mlir::omp::LoopNestOperands &clauseOps, llvm::ArrayRef iv, llvm::ArrayRef wrapperSyms, llvm::ArrayRef wrapperArgs, @@ -1395,7 +1393,7 @@ genMaskedOp(lower::AbstractConverter &converter, lower::SymMap &symTable, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { lower::StatementContext stmtCtx; - mlir::omp::MaskedClauseOps clauseOps; + mlir::omp::MaskedOperands clauseOps; genMaskedClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); return genOpWithBody( @@ -1429,7 +1427,7 @@ genOrderedRegionOp(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { - mlir::omp::OrderedRegionClauseOps clauseOps; + mlir::omp::OrderedRegionOperands clauseOps; genOrderedRegionClauses(converter, semaCtx, item->clauses, loc, clauseOps); return genOpWithBody( @@ -1443,7 +1441,7 @@ genParallelOp(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item, - mlir::omp::ParallelClauseOps &clauseOps, + mlir::omp::ParallelOperands &clauseOps, llvm::ArrayRef reductionSyms, llvm::ArrayRef reductionTypes) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); @@ -1534,7 +1532,7 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, const parser::OmpSectionBlocks §ionBlocks) { llvm::SmallVector reductionTypes; 
llvm::SmallVector reductionSyms; - mlir::omp::SectionsClauseOps clauseOps; + mlir::omp::SectionsOperands clauseOps; genSectionsClauses(converter, semaCtx, item->clauses, loc, clauseOps, reductionTypes, reductionSyms); @@ -1635,7 +1633,7 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, // Emit implicit barrier to synchronize threads and avoid data // races on post-update of lastprivate variables when `nowait` // clause is present. - if (clauseOps.nowaitAttr && !lastprivates.empty()) + if (clauseOps.nowait && !lastprivates.empty()) builder.create(loc); symTable.popScope(); @@ -1647,7 +1645,7 @@ genSingleOp(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { - mlir::omp::SingleClauseOps clauseOps; + mlir::omp::SingleOperands clauseOps; genSingleClauses(converter, semaCtx, item->clauses, loc, clauseOps); return genOpWithBody( @@ -1669,7 +1667,7 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, !llvm::cast(*converter.getModuleOp()) .getIsTargetDevice(); - mlir::omp::TargetClauseOps clauseOps; + mlir::omp::TargetOperands clauseOps; llvm::SmallVector mapSyms, devicePtrSyms, deviceAddrSyms; llvm::SmallVector mapLocs, devicePtrLocs, deviceAddrLocs; @@ -1797,7 +1795,7 @@ genTargetDataOp(lower::AbstractConverter &converter, lower::SymMap &symTable, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { lower::StatementContext stmtCtx; - mlir::omp::TargetDataClauseOps clauseOps; + mlir::omp::TargetDataOperands clauseOps; llvm::SmallVector useDeviceTypes; llvm::SmallVector useDeviceLocs; llvm::SmallVector useDeviceSyms; @@ -1835,7 +1833,7 @@ static OpTy genTargetEnterExitUpdateDataOp(lower::AbstractConverter &converter, llvm_unreachable("Unexpected TARGET DATA construct"); } - mlir::omp::TargetEnterExitUpdateDataClauseOps clauseOps; + mlir::omp::TargetEnterExitUpdateDataOperands clauseOps; genTargetEnterExitUpdateDataClauses(converter, semaCtx, stmtCtx, item->clauses, loc, directive, clauseOps); @@ -1848,7 +1846,7 @@ genTaskOp(lower::AbstractConverter &converter, lower::SymMap &symTable, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { lower::StatementContext stmtCtx; - mlir::omp::TaskClauseOps clauseOps; + mlir::omp::TaskOperands clauseOps; genTaskClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); return genOpWithBody( @@ -1863,7 +1861,7 @@ genTaskgroupOp(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { - mlir::omp::TaskgroupClauseOps clauseOps; + mlir::omp::TaskgroupOperands clauseOps; genTaskgroupClauses(converter, semaCtx, item->clauses, loc, clauseOps); return genOpWithBody( @@ -1878,7 +1876,7 @@ genTaskwaitOp(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { - mlir::omp::TaskwaitClauseOps clauseOps; + mlir::omp::TaskwaitOperands clauseOps; genTaskwaitClauses(converter, semaCtx, item->clauses, loc, clauseOps); return converter.getFirOpBuilder().create(loc, clauseOps); @@ -1898,7 +1896,7 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap 
&symTable, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { lower::StatementContext stmtCtx; - mlir::omp::TeamsClauseOps clauseOps; + mlir::omp::TeamsOperands clauseOps; genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); return genOpWithBody( @@ -1920,11 +1918,11 @@ static void genStandaloneDistribute( ConstructQueue::iterator item, DataSharingProcessor &dsp) { lower::StatementContext stmtCtx; - mlir::omp::DistributeClauseOps distributeClauseOps; + mlir::omp::DistributeOperands distributeClauseOps; genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc, distributeClauseOps); - mlir::omp::LoopNestClauseOps loopNestClauseOps; + mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc, loopNestClauseOps, iv); @@ -1948,13 +1946,13 @@ static void genStandaloneDo(lower::AbstractConverter &converter, DataSharingProcessor &dsp) { lower::StatementContext stmtCtx; - mlir::omp::WsloopClauseOps wsloopClauseOps; + mlir::omp::WsloopOperands wsloopClauseOps; llvm::SmallVector reductionSyms; llvm::SmallVector reductionTypes; genWsloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc, wsloopClauseOps, reductionTypes, reductionSyms); - mlir::omp::LoopNestClauseOps loopNestClauseOps; + mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc, loopNestClauseOps, iv); @@ -1978,7 +1976,7 @@ static void genStandaloneParallel(lower::AbstractConverter &converter, ConstructQueue::iterator item) { lower::StatementContext stmtCtx; - mlir::omp::ParallelClauseOps clauseOps; + mlir::omp::ParallelOperands clauseOps; llvm::SmallVector reductionSyms; llvm::SmallVector reductionTypes; genParallelClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps, @@ -1995,10 +1993,10 @@ static void genStandaloneSimd(lower::AbstractConverter &converter, const ConstructQueue &queue, ConstructQueue::iterator item, DataSharingProcessor &dsp) { - mlir::omp::SimdClauseOps simdClauseOps; + mlir::omp::SimdOperands simdClauseOps; genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps); - mlir::omp::LoopNestClauseOps loopNestClauseOps; + mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc, loopNestClauseOps, iv); @@ -2049,14 +2047,14 @@ static void genCompositeDistributeSimd( lower::StatementContext stmtCtx; // Clause processing. - mlir::omp::DistributeClauseOps distributeClauseOps; + mlir::omp::DistributeOperands distributeClauseOps; genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc, distributeClauseOps); - mlir::omp::SimdClauseOps simdClauseOps; + mlir::omp::SimdOperands simdClauseOps; genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps); - mlir::omp::LoopNestClauseOps loopNestClauseOps; + mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc, loopNestClauseOps, iv); @@ -2095,16 +2093,16 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter, lower::StatementContext stmtCtx; // Clause processing. 
- mlir::omp::WsloopClauseOps wsloopClauseOps; + mlir::omp::WsloopOperands wsloopClauseOps; llvm::SmallVector wsloopReductionSyms; llvm::SmallVector wsloopReductionTypes; genWsloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc, wsloopClauseOps, wsloopReductionTypes, wsloopReductionSyms); - mlir::omp::SimdClauseOps simdClauseOps; + mlir::omp::SimdOperands simdClauseOps; genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps); - mlir::omp::LoopNestClauseOps loopNestClauseOps; + mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc, loopNestClauseOps, iv); @@ -2315,7 +2313,7 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPDeclareTargetConstruct &declareTargetConstruct) { - mlir::omp::DeclareTargetClauseOps clauseOps; + mlir::omp::DeclareTargetOperands clauseOps; llvm::SmallVector symbolAndClause; mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, diff --git a/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp b/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp index 35203fe89f5bc4..ddaa3c5f404f0b 100644 --- a/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp +++ b/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp @@ -109,10 +109,10 @@ class OMPMapInfoFinalizationPass if (auto mapClauseOwner = llvm::dyn_cast(target)) { llvm::SmallVector newMapOps; - mlir::OperandRange mapOperandsArr = mapClauseOwner.getMapOperands(); + mlir::OperandRange mapVarsArr = mapClauseOwner.getMapVars(); - for (size_t i = 0; i < mapOperandsArr.size(); ++i) { - if (mapOperandsArr[i] == op) { + for (size_t i = 0; i < mapVarsArr.size(); ++i) { + if (mapVarsArr[i] == op) { // Push new implicit maps generated for the descriptor. newMapOps.push_back(baseAddr); @@ -120,13 +120,13 @@ class OMPMapInfoFinalizationPass // new additional map operand with an appropriate BlockArgument, // as the printing and later processing currently requires a 1:1 // mapping of BlockArgs to MapInfoOp's at the same placement in - // each array (BlockArgs and MapOperands). + // each array (BlockArgs and MapVars). if (auto targetOp = llvm::dyn_cast(target)) targetOp.getRegion().insertArgument(i, baseAddr.getType(), loc); } - newMapOps.push_back(mapOperandsArr[i]); + newMapOps.push_back(mapVarsArr[i]); } - mapClauseOwner.getMapOperandsMutable().assign(newMapOps); + mapClauseOwner.getMapVarsMutable().assign(newMapOps); } mlir::Value newDescParentMapOp = builder.create( @@ -196,27 +196,27 @@ class OMPMapInfoFinalizationPass return; llvm::SmallVector newMapOps; - mlir::OperandRange mapOperandsArr = mapClauseOwner.getMapOperands(); + mlir::OperandRange mapVarsArr = mapClauseOwner.getMapVars(); auto targetOp = llvm::dyn_cast(target); - for (size_t i = 0; i < mapOperandsArr.size(); ++i) { - if (mapOperandsArr[i] == op) { + for (size_t i = 0; i < mapVarsArr.size(); ++i) { + if (mapVarsArr[i] == op) { for (auto [j, mapMember] : llvm::enumerate(op.getMembers())) { newMapOps.push_back(mapMember); // for TargetOp's which have IsolatedFromAbove we must align the // new additional map operand with an appropriate BlockArgument, // as the printing and later processing currently requires a 1:1 // mapping of BlockArgs to MapInfoOp's at the same placement in - // each array (BlockArgs and MapOperands). 
+ // each array (BlockArgs and MapVars). if (targetOp) { targetOp.getRegion().insertArgument(i + j, mapMember.getType(), targetOp->getLoc()); } } } - newMapOps.push_back(mapOperandsArr[i]); + newMapOps.push_back(mapVarsArr[i]); } - mapClauseOwner.getMapOperandsMutable().assign(newMapOps); + mapClauseOwner.getMapVarsMutable().assign(newMapOps); } // This pass executes on omp::MapInfoOp's containing descriptor based types diff --git a/flang/test/Driver/omp-driver-offload.f90 b/flang/test/Driver/omp-driver-offload.f90 index c7cc3bffb43b53..f8876c96d76c97 100644 --- a/flang/test/Driver/omp-driver-offload.f90 +++ b/flang/test/Driver/omp-driver-offload.f90 @@ -14,12 +14,12 @@ ! Test regular -fopenmp with offload, and invocation filtering options ! RUN: %flang -S -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=gfx90a --offload-arch=sm_70 \ -! RUN: --target=aarch64-unknown-linux-gnu \ +! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\ ! RUN: | FileCheck %s --check-prefix=OFFLOAD-HOST-AND-DEVICE ! RUN: %flang -S -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=gfx90a --offload-arch=sm_70 --offload-host-device \ -! RUN: --target=aarch64-unknown-linux-gnu \ +! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\ ! RUN: | FileCheck %s --check-prefix=OFFLOAD-HOST-AND-DEVICE ! OFFLOAD-HOST-AND-DEVICE: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu" @@ -29,7 +29,7 @@ ! RUN: %flang -S -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=gfx90a --offload-arch=sm_70 --offload-host-only \ -! RUN: --target=aarch64-unknown-linux-gnu \ +! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\ ! RUN: | FileCheck %s --check-prefix=OFFLOAD-HOST ! OFFLOAD-HOST: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu" @@ -39,7 +39,7 @@ ! RUN: %flang -S -### %s 2>&1 \ ! RUN: -fopenmp --offload-arch=gfx90a --offload-arch=sm_70 --offload-device-only \ -! RUN: --target=aarch64-unknown-linux-gnu \ +! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\ ! RUN: | FileCheck %s --check-prefix=OFFLOAD-DEVICE ! OFFLOAD-DEVICE: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu" @@ -48,13 +48,13 @@ ! OFFLOAD-DEVICE-NOT: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu" ! Test regular -fopenmp with offload for basic fopenmp-is-target-device flag addition and correct fopenmp -! RUN: %flang -### -fopenmp --offload-arch=gfx90a -fopenmp-targets=amdgcn-amd-amdhsa %s 2>&1 | FileCheck --check-prefixes=CHECK-OPENMP-IS-TARGET-DEVICE %s +! RUN: %flang -### -fopenmp --offload-arch=gfx90a -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib %s 2>&1 | FileCheck --check-prefixes=CHECK-OPENMP-IS-TARGET-DEVICE %s ! CHECK-OPENMP-IS-TARGET-DEVICE: "{{[^"]*}}flang-new" "-fc1" {{.*}} "-fopenmp" {{.*}} "-fopenmp-is-target-device" {{.*}}.f90" ! Testing appropriate flags are gnerated and appropriately assigned by the driver when offloading ! RUN: %flang -S -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=gfx90a \ -! RUN: --target=aarch64-unknown-linux-gnu \ +! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\ ! RUN: | FileCheck %s --check-prefix=OPENMP-OFFLOAD-ARGS ! OPENMP-OFFLOAD-ARGS: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu" {{.*}} "-fopenmp" {{.*}}.f90" ! OPENMP-OFFLOAD-ARGS-NEXT: "{{[^"]*}}flang-new" "-fc1" "-triple" "amdgcn-amd-amdhsa" @@ -70,19 +70,19 @@ ! RUN: %flang -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=gfx90a \ ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \ -! RUN: -fopenmp-assume-threads-oversubscription \ +! 
RUN: -fopenmp-assume-threads-oversubscription -nogpulib \ ! RUN: | FileCheck %s --check-prefixes=CHECK-THREADS-OVS ! RUN: %flang -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=sm_70 \ ! RUN: -fopenmp-targets=nvptx64-nvidia-cuda \ -! RUN: -fopenmp-assume-threads-oversubscription \ +! RUN: -fopenmp-assume-threads-oversubscription \ ! RUN: | FileCheck %s --check-prefixes=CHECK-THREADS-OVS ! CHECK-THREADS-OVS: "{{[^"]*}}flang-new" "-fc1" {{.*}} "-fopenmp" {{.*}} "-fopenmp-is-target-device" "-fopenmp-assume-threads-oversubscription" {{.*}}.f90" ! RUN: %flang -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=gfx90a \ ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \ -! RUN: -fopenmp-assume-teams-oversubscription \ +! RUN: -fopenmp-assume-teams-oversubscription -nogpulib\ ! RUN: | FileCheck %s --check-prefixes=CHECK-TEAMS-OVS ! RUN: %flang -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=sm_70 \ @@ -94,7 +94,7 @@ ! RUN: %flang -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=gfx90a \ ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \ -! RUN: -fopenmp-assume-no-nested-parallelism \ +! RUN: -fopenmp-assume-no-nested-parallelism -nogpulib\ ! RUN: | FileCheck %s --check-prefixes=CHECK-NEST-PAR ! RUN: %flang -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=sm_70 \ @@ -106,7 +106,7 @@ ! RUN: %flang -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=gfx90a \ ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \ -! RUN: -fopenmp-assume-no-thread-state \ +! RUN: -fopenmp-assume-no-thread-state -nogpulib\ ! RUN: | FileCheck %s --check-prefixes=CHECK-THREAD-STATE ! RUN: %flang -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=sm_70 \ @@ -118,7 +118,7 @@ ! RUN: %flang -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=gfx90a \ ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \ -! RUN: -fopenmp-target-debug \ +! RUN: -fopenmp-target-debug -nogpulib\ ! RUN: | FileCheck %s --check-prefixes=CHECK-TARGET-DEBUG ! RUN: %flang -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=sm_70 \ @@ -130,7 +130,7 @@ ! RUN: %flang -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=gfx90a \ ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \ -! RUN: -fopenmp-target-debug \ +! RUN: -fopenmp-target-debug -nogpulib\ ! RUN: | FileCheck %s --check-prefixes=CHECK-TARGET-DEBUG ! RUN: %flang -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=sm_70 \ @@ -144,7 +144,7 @@ ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \ ! RUN: -fopenmp-target-debug -fopenmp-assume-threads-oversubscription \ ! RUN: -fopenmp-assume-teams-oversubscription -fopenmp-assume-no-nested-parallelism \ -! RUN: -fopenmp-assume-no-thread-state \ +! RUN: -fopenmp-assume-no-thread-state -nogpulib\ ! RUN: | FileCheck %s --check-prefixes=CHECK-RTL-ALL ! RUN: %flang -S -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=sm_70 \ @@ -160,7 +160,7 @@ ! RUN: %flang -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=gfx90a \ ! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \ -! RUN: -fopenmp-version=45 \ +! RUN: -fopenmp-version=45 -nogpulib\ ! RUN: | FileCheck %s --check-prefixes=CHECK-OPENMP-VERSION ! RUN: %flang -### %s -o %t 2>&1 \ ! RUN: -fopenmp --offload-arch=sm_70 \ @@ -193,10 +193,17 @@ ! Test -fopenmp-force-usm option with offload ! RUN: %flang -S -### %s -o %t 2>&1 \ ! RUN: -fopenmp -fopenmp-force-usm --offload-arch=gfx90a \ -! RUN: --target=aarch64-unknown-linux-gnu \ +! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\ ! RUN: | FileCheck %s --check-prefix=FORCE-USM-OFFLOAD ! FORCE-USM-OFFLOAD: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu" ! 
FORCE-USM-OFFLOAD-SAME: "-fopenmp" "-fopenmp-force-usm" ! FORCE-USM-OFFLOAD-NEXT: "{{[^"]*}}flang-new" "-fc1" "-triple" "amdgcn-amd-amdhsa" ! FORCE-USM-OFFLOAD-SAME: "-fopenmp" "-fopenmp-force-usm" + +! RUN: %flang -### -v --target=x86_64-unknown-linux-gnu -fopenmp \ +! RUN: --offload-arch=gfx900 \ +! RUN: --rocm-path=%S/Inputs/rocm %s 2>&1 \ +! RUN: | FileCheck --check-prefix=MLINK-BUILTIN-BITCODE %s +! MLINK-BUILTIN-BITCODE: "{{[^"]*}}flang-new" "-fc1" "-triple" "amdgcn-amd-amdhsa" +! MLINK-BUILTIN-BITCODE-SAME: "-mlink-builtin-bitcode" {{.*Inputs.*rocm.*amdgcn.*bitcode.*}}oclc_isa_version_900.bc diff --git a/flang/test/Driver/target-cpu-features.f90 b/flang/test/Driver/target-cpu-features.f90 index 0f19e4ebff2a0c..1c77d4ace5fbc1 100644 --- a/flang/test/Driver/target-cpu-features.f90 +++ b/flang/test/Driver/target-cpu-features.f90 @@ -29,10 +29,10 @@ ! RUN: %flang --target=riscv64-linux-gnu -c %s -### 2>&1 \ ! RUN: | FileCheck %s -check-prefix=CHECK-RV64 -! RUN: %flang --target=amdgcn-amd-amdhsa -mcpu=gfx908 -c %s -### 2>&1 \ +! RUN: %flang --target=amdgcn-amd-amdhsa -mcpu=gfx908 -nogpulib -c %s -### 2>&1 \ ! RUN: | FileCheck %s -check-prefix=CHECK-AMDGPU -! RUN: %flang --target=r600-unknown-unknown -mcpu=cayman -c %s -### 2>&1 \ +! RUN: %flang --target=r600-unknown-unknown -mcpu=cayman -nogpulib -c %s -### 2>&1 \ ! RUN: | FileCheck %s -check-prefix=CHECK-AMDGPU-R600 ! CHECK-A57: "-fc1" "-triple" "aarch64-unknown-linux-gnu" diff --git a/flang/test/Driver/target-gpu-features.f90 b/flang/test/Driver/target-gpu-features.f90 index 9cc9ce4baaf4d6..b783574370a0f5 100644 --- a/flang/test/Driver/target-gpu-features.f90 +++ b/flang/test/Driver/target-gpu-features.f90 @@ -3,7 +3,7 @@ ! Test that -mcpu are used and that the -target-cpu and -target-features ! are also added to the fc1 command. -! RUN: %flang --target=amdgcn-amd-amdhsa -mcpu=gfx902 -c %s -### 2>&1 \ +! RUN: %flang --target=amdgcn-amd-amdhsa -mcpu=gfx902 -nogpulib -c %s -### 2>&1 \ ! RUN: | FileCheck %s -check-prefix=CHECK-AMDGCN ! 
CHECK-AMDGCN: "-fc1" "-triple" "amdgcn-amd-amdhsa" diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 77da994d8dfdf9..4e7b23225f54e6 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -692,6 +692,8 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.pthread.pthread_mutexattr_setrobust libc.src.pthread.pthread_mutexattr_settype libc.src.pthread.pthread_once + libc.src.pthread.pthread_rwlock_clockrdlock + libc.src.pthread.pthread_rwlock_clockwrlock libc.src.pthread.pthread_rwlock_destroy libc.src.pthread.pthread_rwlock_init libc.src.pthread.pthread_rwlock_rdlock diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 3ec166fc0096c7..04b8b3bc4ce391 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -703,6 +703,8 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.pthread.pthread_mutexattr_setrobust libc.src.pthread.pthread_mutexattr_settype libc.src.pthread.pthread_once + libc.src.pthread.pthread_rwlock_clockrdlock + libc.src.pthread.pthread_rwlock_clockwrlock libc.src.pthread.pthread_rwlock_destroy libc.src.pthread.pthread_rwlock_init libc.src.pthread.pthread_rwlock_rdlock diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index fe3bd7a757e8fa..fa656d946eceb9 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -790,6 +790,8 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.pthread.pthread_mutexattr_setrobust libc.src.pthread.pthread_mutexattr_settype libc.src.pthread.pthread_once + libc.src.pthread.pthread_rwlock_clockrdlock + libc.src.pthread.pthread_rwlock_clockwrlock libc.src.pthread.pthread_rwlock_destroy libc.src.pthread.pthread_rwlock_init libc.src.pthread.pthread_rwlock_rdlock diff --git a/libc/docs/dev/undefined_behavior.rst b/libc/docs/dev/undefined_behavior.rst index 3faae3134ce2a4..9f50545d745d69 100644 --- a/libc/docs/dev/undefined_behavior.rst +++ b/libc/docs/dev/undefined_behavior.rst @@ -93,3 +93,8 @@ direction in this case. Non-const Constant Return Values -------------------------------- Some libc functions, like ``dlerror()``, return ``char *`` instead of ``const char *`` and then tell the caller they promise not to modify this value. Any modification of this value is undefined behavior. + +Unrecognized ``clockid_t`` values for ``pthread_rwlock_clock*`` APIs +---------------------------------------------------------------------- +POSIX.1-2024 only demands support for ``CLOCK_REALTIME`` and ``CLOCK_MONOTONIC``. Currently, +in LLVM libc, if other clock ids are used, they will be treated as monotonic clocks. diff --git a/libc/fuzzing/math/RemQuoDiff.h b/libc/fuzzing/math/RemQuoDiff.h index 84a6a24ce52719..cfb9b7f44e5488 100644 --- a/libc/fuzzing/math/RemQuoDiff.h +++ b/libc/fuzzing/math/RemQuoDiff.h @@ -31,21 +31,22 @@ void RemQuoDiff(RemQuoFunc func1, RemQuoFunc func2, const uint8_t *data, T remainder1 = func1(x, y, &q1); T remainder2 = func2(x, y, &q2); - if (isnan(remainder1)) { - if (!isnan(remainder2)) + LIBC_NAMESPACE::fputil::FPBits bits1(remainder1); + LIBC_NAMESPACE::fputil::FPBits bits2(remainder2); + + if (bits1.is_nan()) { + if (!bits2.is_nan()) __builtin_trap(); return; } - if (isinf(remainder2) != isinf(remainder1)) + if (bits1.is_inf() != bits2.is_inf()) __builtin_trap(); // Compare only the 3 LS bits of the quotient.
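// (A clarifying note, not part of the original patch: the C standard only
// requires remquo to store a value congruent to the integral quotient modulo
// 2^n, for some implementation-defined n >= 3, with the correct sign. Only
// the sign and the low three bits are therefore portable across libm
// implementations, which is why the comparison below masks with 0x7.)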
if ((q1 & 0x7) != (q2 & 0x7)) __builtin_trap(); - LIBC_NAMESPACE::fputil::FPBits bits1(remainder1); - LIBC_NAMESPACE::fputil::FPBits bits2(remainder2); if (bits1.uintval() != bits2.uintval()) __builtin_trap(); } diff --git a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp index c158162ba6238e..503b55978e2cb1 100644 --- a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp +++ b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp @@ -118,9 +118,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { __builtin_trap(); // If any result is NaN, all of them should be NaN. We can't use the usual // comparisons because NaN != NaN. - if (isnan(float_result) ^ isnan(strtof_result)) + if (FPBits(float_result).is_nan() != + FPBits(strtof_result).is_nan()) __builtin_trap(); - if (!isnan(float_result) && float_result != strtof_result) + if (!FPBits(float_result).is_nan() && float_result != strtof_result) __builtin_trap(); mpfr_clear(mpfr_float); } @@ -136,10 +137,12 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { ptrdiff_t strtod_strlen = out_ptr - str_ptr; if (result_strlen != strtod_strlen) __builtin_trap(); - if (isnan(double_result) ^ isnan(strtod_result) || - isnan(double_result) ^ isnan(atof_result)) + if (FPBits(double_result).is_nan() != + FPBits(strtod_result).is_nan() || + FPBits(double_result).is_nan() != + FPBits(atof_result).is_nan()) __builtin_trap(); - if (!isnan(double_result) && + if (!FPBits(double_result).is_nan() && (double_result != strtod_result || double_result != atof_result)) __builtin_trap(); mpfr_clear(mpfr_double); @@ -156,9 +159,11 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { ptrdiff_t strtold_strlen = out_ptr - str_ptr; if (result_strlen != strtold_strlen) __builtin_trap(); - if (isnan(long_double_result) ^ isnan(strtold_result)) + if (FPBits(long_double_result).is_nan() ^ + FPBits(strtold_result).is_nan()) __builtin_trap(); - if (!isnan(long_double_result) && long_double_result != strtold_result) + if (!FPBits(long_double_result).is_nan() && + long_double_result != strtold_result) __builtin_trap(); mpfr_clear(mpfr_long_double); } diff --git a/libc/newhdrgen/yaml/pthread.yaml b/libc/newhdrgen/yaml/pthread.yaml index 292d91751e4068..d492dfcbd51eba 100644 --- a/libc/newhdrgen/yaml/pthread.yaml +++ b/libc/newhdrgen/yaml/pthread.yaml @@ -370,6 +370,20 @@ functions: arguments: - type: pthread_rwlock_t *__restrict - type: const struct timespec *__restrict + - name: pthread_rwlock_clockrdlock + standards: POSIX + return_type: int + arguments: + - type: pthread_rwlock_t *__restrict + - type: clockid_t + - type: const struct timespec *__restrict + - name: pthread_rwlock_clockwrlock + standards: POSIX + return_type: int + arguments: + - type: pthread_rwlock_t *__restrict + - type: clockid_t + - type: const struct timespec *__restrict - name: pthread_rwlock_rdlock standards: POSIX return_type: int diff --git a/libc/spec/posix.td b/libc/spec/posix.td index 1878b1ee2ae412..1b7e18e9996007 100644 --- a/libc/spec/posix.td +++ b/libc/spec/posix.td @@ -1325,6 +1325,16 @@ def POSIX : StandardSpec<"POSIX"> { RetValSpec, [ArgSpec, ArgSpec] >, + FunctionSpec< + "pthread_rwlock_clockrdlock", + RetValSpec, + [ArgSpec, ArgSpec, ArgSpec] + >, + FunctionSpec< + "pthread_rwlock_clockwrlock", + RetValSpec, + [ArgSpec, ArgSpec, ArgSpec] + >, FunctionSpec< "pthread_rwlock_rdlock", RetValSpec, diff --git a/libc/src/compiler/generic/__stack_chk_fail.cpp b/libc/src/compiler/generic/__stack_chk_fail.cpp index 
639204d29590ab..c76ec1407ad356 100644 --- a/libc/src/compiler/generic/__stack_chk_fail.cpp +++ b/libc/src/compiler/generic/__stack_chk_fail.cpp @@ -13,7 +13,7 @@ extern "C" { void __stack_chk_fail(void) { - LIBC_NAMESPACE::write_to_stderr("stack smashing detected"); + LIBC_NAMESPACE::write_to_stderr("stack smashing detected\n"); LIBC_NAMESPACE::abort(); } diff --git a/libc/src/pthread/CMakeLists.txt b/libc/src/pthread/CMakeLists.txt index dc748b22e03787..70d10e6c4e3f88 100644 --- a/libc/src/pthread/CMakeLists.txt +++ b/libc/src/pthread/CMakeLists.txt @@ -556,6 +556,28 @@ add_entrypoint_object( libc.src.__support.threads.linux.rwlock ) +add_entrypoint_object( + pthread_rwlock_clockrdlock + SRCS + pthread_rwlock_clockrdlock.cpp + HDRS + pthread_rwlock_clockrdlock.h + DEPENDS + libc.include.pthread + libc.src.__support.threads.linux.rwlock +) + +add_entrypoint_object( + pthread_rwlock_clockwrlock + SRCS + pthread_rwlock_clockwrlock.cpp + HDRS + pthread_rwlock_clockwrlock.h + DEPENDS + libc.include.pthread + libc.src.__support.threads.linux.rwlock +) + add_entrypoint_object( pthread_rwlock_timedrdlock SRCS diff --git a/libc/src/pthread/pthread_rwlock_clockrdlock.cpp b/libc/src/pthread/pthread_rwlock_clockrdlock.cpp new file mode 100644 index 00000000000000..1e44e6d7694f63 --- /dev/null +++ b/libc/src/pthread/pthread_rwlock_clockrdlock.cpp @@ -0,0 +1,51 @@ +//===-- Implementation of the Rwlock's clockrdlock function ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/pthread/pthread_rwlock_clockrdlock.h" + +#include "hdr/errno_macros.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/threads/linux/rwlock.h" +#include "src/__support/time/linux/abs_timeout.h" + +#include + +namespace LIBC_NAMESPACE_DECL { + +static_assert( + sizeof(RwLock) == sizeof(pthread_rwlock_t) && + alignof(RwLock) == alignof(pthread_rwlock_t), + "The public pthread_rwlock_t type must be of the same size and alignment " + "as the internal rwlock type."); + +LLVM_LIBC_FUNCTION(int, pthread_rwlock_clockrdlock, + (pthread_rwlock_t * rwlock, clockid_t clockid, + const timespec *abstime)) { + if (!rwlock) + return EINVAL; + if (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME) + return EINVAL; + bool is_realtime = (clockid == CLOCK_REALTIME); + RwLock *rw = reinterpret_cast(rwlock); + LIBC_ASSERT(abstime && "clockrdlock called with a null timeout"); + auto timeout = internal::AbsTimeout::from_timespec( + *abstime, /*is_realtime=*/is_realtime); + if (LIBC_LIKELY(timeout.has_value())) + return static_cast(rw->read_lock(timeout.value())); + + switch (timeout.error()) { + case internal::AbsTimeout::Error::Invalid: + return EINVAL; + case internal::AbsTimeout::Error::BeforeEpoch: + return ETIMEDOUT; + } + __builtin_unreachable(); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/pthread/pthread_rwlock_clockrdlock.h b/libc/src/pthread/pthread_rwlock_clockrdlock.h new file mode 100644 index 00000000000000..8fbd3b089a1111 --- /dev/null +++ b/libc/src/pthread/pthread_rwlock_clockrdlock.h @@ -0,0 +1,23 @@ +//===-- Implementation header for Rwlock's clockrdlock function --*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKRDLOCK_H +#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKRDLOCK_H + +#include "src/__support/macros/config.h" +#include + +namespace LIBC_NAMESPACE_DECL { + +int pthread_rwlock_clockrdlock(pthread_rwlock_t *__restrict rwlock, + clockid_t clockid, + const timespec *__restrict abstime); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKRDLOCK_H diff --git a/libc/src/pthread/pthread_rwlock_clockwrlock.cpp b/libc/src/pthread/pthread_rwlock_clockwrlock.cpp new file mode 100644 index 00000000000000..8f58c7f24bb10a --- /dev/null +++ b/libc/src/pthread/pthread_rwlock_clockwrlock.cpp @@ -0,0 +1,51 @@ +//===-- Implementation of the Rwlock's clockwrlock function----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/pthread/pthread_rwlock_clockwrlock.h" + +#include "hdr/errno_macros.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/threads/linux/rwlock.h" +#include "src/__support/time/linux/abs_timeout.h" + +#include + +namespace LIBC_NAMESPACE_DECL { + +static_assert( + sizeof(RwLock) == sizeof(pthread_rwlock_t) && + alignof(RwLock) == alignof(pthread_rwlock_t), + "The public pthread_rwlock_t type must be of the same size and alignment " + "as the internal rwlock type."); + +LLVM_LIBC_FUNCTION(int, pthread_rwlock_clockwrlock, + (pthread_rwlock_t * rwlock, clockid_t clockid, + const timespec *abstime)) { + if (!rwlock) + return EINVAL; + if (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME) + return EINVAL; + bool is_realtime = (clockid == CLOCK_REALTIME); + RwLock *rw = reinterpret_cast(rwlock); + LIBC_ASSERT(abstime && "clockwrlock called with a null timeout"); + auto timeout = internal::AbsTimeout::from_timespec( + *abstime, /*is_realtime=*/is_realtime); + if (LIBC_LIKELY(timeout.has_value())) + return static_cast(rw->write_lock(timeout.value())); + + switch (timeout.error()) { + case internal::AbsTimeout::Error::Invalid: + return EINVAL; + case internal::AbsTimeout::Error::BeforeEpoch: + return ETIMEDOUT; + } + __builtin_unreachable(); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/pthread/pthread_rwlock_clockwrlock.h b/libc/src/pthread/pthread_rwlock_clockwrlock.h new file mode 100644 index 00000000000000..cb3fa3909fd2be --- /dev/null +++ b/libc/src/pthread/pthread_rwlock_clockwrlock.h @@ -0,0 +1,23 @@ +//===-- Implementation header for Rwlock's clockwrlock function --*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKWRLOCK_H +#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKWRLOCK_H + +#include "src/__support/macros/config.h" +#include <pthread.h> + +namespace LIBC_NAMESPACE_DECL { + +int pthread_rwlock_clockwrlock(pthread_rwlock_t *__restrict rwlock, + clockid_t clockid, + const timespec *__restrict abstime); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKWRLOCK_H diff --git a/libc/test/integration/src/pthread/CMakeLists.txt b/libc/test/integration/src/pthread/CMakeLists.txt index fa5fd3ad55d5ff..eb26822597c2f2 100644 --- a/libc/test/integration/src/pthread/CMakeLists.txt +++ b/libc/test/integration/src/pthread/CMakeLists.txt @@ -32,9 +32,11 @@ add_integration_test( libc.src.pthread.pthread_rwlock_rdlock libc.src.pthread.pthread_rwlock_tryrdlock libc.src.pthread.pthread_rwlock_timedrdlock + libc.src.pthread.pthread_rwlock_clockrdlock libc.src.pthread.pthread_rwlock_wrlock libc.src.pthread.pthread_rwlock_trywrlock libc.src.pthread.pthread_rwlock_timedwrlock + libc.src.pthread.pthread_rwlock_clockwrlock libc.src.pthread.pthread_rwlock_unlock libc.src.pthread.pthread_create libc.src.pthread.pthread_join diff --git a/libc/test/integration/src/pthread/pthread_rwlock_test.cpp b/libc/test/integration/src/pthread/pthread_rwlock_test.cpp index 455003b6af8118..9f5fba187713eb 100644 --- a/libc/test/integration/src/pthread/pthread_rwlock_test.cpp +++ b/libc/test/integration/src/pthread/pthread_rwlock_test.cpp @@ -15,6 +15,8 @@ #include "src/__support/threads/sleep.h" #include "src/pthread/pthread_create.h" #include "src/pthread/pthread_join.h" +#include "src/pthread/pthread_rwlock_clockrdlock.h" +#include "src/pthread/pthread_rwlock_clockwrlock.h" #include "src/pthread/pthread_rwlock_destroy.h" #include "src/pthread/pthread_rwlock_init.h" #include "src/pthread/pthread_rwlock_rdlock.h" @@ -112,6 +114,12 @@ static void nullptr_test() { ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_wrlock(nullptr), EINVAL); ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(nullptr, &ts), EINVAL); ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(nullptr, &ts), EINVAL); + ASSERT_EQ( + LIBC_NAMESPACE::pthread_rwlock_clockrdlock(nullptr, CLOCK_MONOTONIC, &ts), + EINVAL); + ASSERT_EQ( + LIBC_NAMESPACE::pthread_rwlock_clockwrlock(nullptr, CLOCK_MONOTONIC, &ts), + EINVAL); ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_tryrdlock(nullptr), EINVAL); ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_trywrlock(nullptr), EINVAL); ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_unlock(nullptr), EINVAL); @@ -159,16 +167,40 @@ static void unusual_timespec_test() { timespec ts = {0, -1}; ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(&rwlock, &ts), EINVAL); ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(&rwlock, &ts), EINVAL); + ASSERT_EQ( + LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_MONOTONIC, &ts), + EINVAL); + ASSERT_EQ( + LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_MONOTONIC, &ts), + EINVAL); ts.tv_nsec = 1'000'000'000; ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(&rwlock, &ts), EINVAL); + ASSERT_EQ( + LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_MONOTONIC, &ts), + EINVAL); + ASSERT_EQ( + LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_MONOTONIC, &ts), + EINVAL); ts.tv_nsec += 1; ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(&rwlock, &ts), EINVAL); +
ASSERT_EQ( + LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_MONOTONIC, &ts), + EINVAL); + ASSERT_EQ( + LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_MONOTONIC, &ts), + EINVAL); ts.tv_nsec = 0; ts.tv_sec = -1; ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(&rwlock, &ts), ETIMEDOUT); ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(&rwlock, &ts), ETIMEDOUT); + ASSERT_EQ( + LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_MONOTONIC, &ts), + ETIMEDOUT); + ASSERT_EQ( + LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_MONOTONIC, &ts), + ETIMEDOUT); } static void timedlock_with_deadlock_test() { @@ -184,6 +216,13 @@ static void timedlock_with_deadlock_test() { ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(&rwlock, &ts), ETIMEDOUT); ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(&rwlock, &ts), 0); + ASSERT_EQ( + LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_REALTIME, &ts), + ETIMEDOUT); + ASSERT_EQ( + LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_REALTIME, &ts), + 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_unlock(&rwlock), 0); ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_unlock(&rwlock), 0); ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_unlock(&rwlock), 0); // notice that ts is already expired, but the following should still succeed. @@ -270,9 +309,11 @@ enum class Operation : int { WRITE = 1, TIMED_READ = 2, TIMED_WRITE = 3, - TRY_READ = 4, - TRY_WRITE = 5, - COUNT = 6 + CLOCK_READ = 4, + CLOCK_WRITE = 5, + TRY_READ = 6, + TRY_WRITE = 7, + COUNT = 8 }; LIBC_NAMESPACE::RawMutex *io_mutex; @@ -358,6 +399,24 @@ static void randomized_thread_operation(SharedData *data, ThreadGuard &guard) { } break; } + case Operation::CLOCK_READ: { + timespec ts = get_ts(); + if (LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&data->lock, CLOCK_MONOTONIC, + &ts) == 0) { + read_ops(); + LIBC_NAMESPACE::pthread_rwlock_unlock(&data->lock); + } + break; + } + case Operation::CLOCK_WRITE: { + timespec ts = get_ts(); + if (LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&data->lock, CLOCK_MONOTONIC, + &ts) == 0) { + write_ops(); + LIBC_NAMESPACE::pthread_rwlock_unlock(&data->lock); + } + break; + } case Operation::TRY_READ: { if (LIBC_NAMESPACE::pthread_rwlock_tryrdlock(&data->lock) == 0) { read_ops(); diff --git a/libc/test/src/math/cbrt_test.cpp b/libc/test/src/math/cbrt_test.cpp index 123351496118b2..2ef2140966f52c 100644 --- a/libc/test/src/math/cbrt_test.cpp +++ b/libc/test/src/math/cbrt_test.cpp @@ -21,8 +21,8 @@ using LIBC_NAMESPACE::testing::tlog; TEST_F(LlvmLibcCbrtTest, InDoubleRange) { constexpr uint64_t COUNT = 123'451; - uint64_t START = LIBC_NAMESPACE::fputil::FPBits(1.0).uintval(); - uint64_t STOP = LIBC_NAMESPACE::fputil::FPBits(8.0).uintval(); + uint64_t START = FPBits(1.0).uintval(); + uint64_t STOP = FPBits(8.0).uintval(); uint64_t STEP = (STOP - START) / COUNT; auto test = [&](mpfr::RoundingMode rounding_mode) { @@ -38,12 +38,12 @@ TEST_F(LlvmLibcCbrtTest, InDoubleRange) { for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) { double x = FPBits(v).get_val(); - if (isnan(x) || isinf(x)) + if (FPBits(x).is_inf_or_nan()) continue; double result = LIBC_NAMESPACE::cbrt(x); ++total; - if (isnan(result) || isinf(result)) + if (FPBits(result).is_inf_or_nan()) continue; ++tested; diff --git a/libc/test/utils/FPUtil/x86_long_double_test.cpp b/libc/test/utils/FPUtil/x86_long_double_test.cpp index 87796b5c9f5ba5..8d16869b2e8163 100644 --- a/libc/test/utils/FPUtil/x86_long_double_test.cpp +++ 
b/libc/test/utils/FPUtil/x86_long_double_test.cpp @@ -27,8 +27,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) { // If exponent has the max value and the implicit bit is 0, // then the number is a NaN for all values of mantissa. bits.set_mantissa(i); - long double nan = bits.get_val(); - ASSERT_NE(static_cast<int>(isnan(nan)), 0); ASSERT_TRUE(bits.is_nan()); } @@ -38,8 +36,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) { // then the number is a NaN for all non-zero values of mantissa. // Note the initial value of |i| of 1 to avoid a zero mantissa. bits.set_mantissa(i); - long double nan = bits.get_val(); - ASSERT_NE(static_cast<int>(isnan(nan)), 0); ASSERT_TRUE(bits.is_nan()); } @@ -49,8 +45,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) { // If exponent is non-zero and also not max, and the implicit bit is 0, // then the number is a NaN for all values of mantissa. bits.set_mantissa(i); - long double nan = bits.get_val(); - ASSERT_NE(static_cast<int>(isnan(nan)), 0); ASSERT_TRUE(bits.is_nan()); } @@ -60,8 +54,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) { // If exponent is non-zero and also not max, and the implicit bit is 1, // then the number is normal value for all values of mantissa. bits.set_mantissa(i); - long double valid = bits.get_val(); - ASSERT_EQ(static_cast<int>(isnan(valid)), 0); ASSERT_FALSE(bits.is_nan()); } @@ -70,8 +62,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) { for (unsigned int i = 0; i < COUNT; ++i) { // If exponent is zero, then the number is a valid but denormal value. bits.set_mantissa(i); - long double valid = bits.get_val(); - ASSERT_EQ(static_cast<int>(isnan(valid)), 0); ASSERT_FALSE(bits.is_nan()); } @@ -80,8 +70,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) { for (unsigned int i = 0; i < COUNT; ++i) { // If exponent is zero, then the number is a valid but denormal value. bits.set_mantissa(i); - long double valid = bits.get_val(); - ASSERT_EQ(static_cast<int>(isnan(valid)), 0); ASSERT_FALSE(bits.is_nan()); } } diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 0173be396163eb..6abd929d2343d4 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -27,6 +27,7 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/GlobPattern.h" #include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/TarWriter.h" #include #include #include @@ -458,6 +459,15 @@ struct ConfigWrapper { LLVM_LIBRARY_VISIBILITY extern ConfigWrapper config; +// Some index properties of a symbol are stored separately in this auxiliary +// struct to decrease sizeof(SymbolUnion) in the majority of cases. +struct SymbolAux { + uint32_t gotIdx = -1; + uint32_t pltIdx = -1; + uint32_t tlsDescIdx = -1; + uint32_t tlsGdIdx = -1; +}; + struct DuplicateSymbol { const Symbol *sym; const InputFile *file; @@ -475,6 +485,8 @@ struct Ctx { SmallVector lazyBitcodeFiles; SmallVector inputSections; SmallVector ehInputSections; + + SmallVector<SymbolAux, 0> symAux; // Duplicate symbol candidates. SmallVector duplicates; // Symbols in a non-prevailing COMDAT group which should be changed to an @@ -489,6 +501,9 @@ struct Ctx { std::pair> backwardReferences; llvm::SmallSet auxiliaryFiles; + // If --reproduce is specified, all input files are written to this tar + // archive. + std::unique_ptr<llvm::TarWriter> tar; // InputFile for linker created symbols with no source location. InputFile *internalFile; // True if SHT_LLVM_SYMPART is used.
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index afee175a659044..86c8afc3799438 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -101,11 +101,14 @@ void Ctx::reset() { lazyBitcodeFiles.clear(); inputSections.clear(); ehInputSections.clear(); + + symAux.clear(); duplicates.clear(); nonPrevailingSyms.clear(); whyExtractRecords.clear(); backwardReferences.clear(); auxiliaryFiles.clear(); + tar.reset(); internalFile = nullptr; hasSympart.store(false, std::memory_order_relaxed); hasTlsIe.store(false, std::memory_order_relaxed); @@ -136,9 +139,7 @@ bool link(ArrayRef<const char *> args, llvm::raw_ostream &stdoutOS, symtab = SymbolTable(); outputSections.clear(); - symAux.clear(); - tar = nullptr; in.reset(); partitions.clear(); @@ -153,7 +154,7 @@ bool link(ArrayRef<const char *> args, llvm::raw_ostream &stdoutOS, config = ConfigWrapper(); script = ScriptWrapper(); - symAux.emplace_back(); + elf::ctx.symAux.emplace_back(); partitions.clear(); partitions.emplace_back(); @@ -224,14 +225,15 @@ std::vector<std::pair<MemoryBufferRef, uint64_t>> static getArchiveMembers( std::vector<std::pair<MemoryBufferRef, uint64_t>> v; Error err = Error::success(); - bool addToTar = file->isThin() && tar; + bool addToTar = file->isThin() && ctx.tar; for (const Archive::Child &c : file->children(err)) { MemoryBufferRef mbref = CHECK(c.getMemoryBufferRef(), mb.getBufferIdentifier() + ": could not get the buffer for a child of the archive"); if (addToTar) - tar->append(relativeToRoot(check(c.getFullName())), mbref.getBuffer()); + ctx.tar->append(relativeToRoot(check(c.getFullName())), + mbref.getBuffer()); v.push_back(std::make_pair(mbref, c.getChildOffset())); } if (err) @@ -640,9 +642,9 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { Expected<std::unique_ptr<TarWriter>> errOrWriter = TarWriter::create(path, path::stem(path)); if (errOrWriter) { - tar = std::move(*errOrWriter); - tar->append("response.txt", createResponseFile(args)); - tar->append("version.txt", getLLDVersion() + "\n"); + ctx.tar = std::move(*errOrWriter); + ctx.tar->append("response.txt", createResponseFile(args)); + ctx.tar->append("version.txt", getLLDVersion() + "\n"); StringRef ltoSampleProfile = args.getLastArgValue(OPT_lto_sample_profile); if (!ltoSampleProfile.empty()) readFile(ltoSampleProfile); @@ -1911,13 +1913,7 @@ void LinkerDriver::createFiles(opt::InputArgList &args) { hasInput = true; break; case OPT_defsym: { - StringRef from; - StringRef to; - std::tie(from, to) = StringRef(arg->getValue()).split('='); - if (from.empty() || to.empty()) - error("--defsym: syntax error: " + StringRef(arg->getValue())); - else - readDefsym(from, MemoryBufferRef(to, "--defsym")); + readDefsym(MemoryBufferRef(arg->getValue(), "--defsym")); break; } case OPT_script: diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index b67250393812de..738de4499f38ff 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -28,7 +28,6 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/Support/RISCVAttributeParser.h" -#include "llvm/Support/TarWriter.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Support/raw_ostream.h" #include @@ -52,8 +51,6 @@ extern template void ObjFile::importCmseSymbols(); bool InputFile::isInGroup; uint32_t InputFile::nextGroupId; -std::unique_ptr<TarWriter> elf::tar; - // Returns "", "foo.a(bar.o)" or "baz.o".
std::string lld::toString(const InputFile *f) { static std::mutex mu; @@ -261,8 +258,8 @@ std::optional<MemoryBufferRef> elf::readFile(StringRef path) { MemoryBufferRef mbref = (*mbOrErr)->getMemBufferRef(); ctx.memoryBuffers.push_back(std::move(*mbOrErr)); // take MB ownership - if (tar) - tar->append(relativeToRoot(path), mbref.getBuffer()); + if (ctx.tar) + ctx.tar->append(relativeToRoot(path), mbref.getBuffer()); return mbref; } diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h index 73469f836e79da..3248a1df116bdc 100644 --- a/lld/ELF/InputFiles.h +++ b/lld/ELF/InputFiles.h @@ -39,9 +39,6 @@ namespace elf { class InputSection; class Symbol; -// If --reproduce is specified, all input files are written to this tar archive. -extern std::unique_ptr<llvm::TarWriter> tar; - // Opens a given file. std::optional<MemoryBufferRef> readFile(StringRef path); diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 9a799cd286135f..94ad7b35708203 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -1716,7 +1716,7 @@ static bool handleNonPreemptibleIfunc(Symbol &sym, uint16_t flags) { auto &dyn = config->androidPackDynRelocs ? *in.relaPlt : *mainPart->relaDyn; addPltEntry(*in.iplt, *in.igotPlt, dyn, target->iRelativeRel, *directSym); sym.allocateAux(); - symAux.back().pltIdx = symAux[directSym->auxIdx].pltIdx; + ctx.symAux.back().pltIdx = ctx.symAux[directSym->auxIdx].pltIdx; if (flags & HAS_DIRECT_RELOC) { // Change the value to the IPLT and redirect all references to it. @@ -1834,7 +1834,7 @@ void elf::postScanRelocations() { {R_ADDEND, target->symbolicRel, got->getTlsIndexOff(), 1, &dummy}); } - assert(symAux.size() == 1); + assert(ctx.symAux.size() == 1); for (Symbol *sym : symtab.getSymbols()) fn(*sym); diff --git a/lld/ELF/ScriptLexer.cpp b/lld/ELF/ScriptLexer.cpp index 9d31b2b226517d..40528f0cd7f611 100644 --- a/lld/ELF/ScriptLexer.cpp +++ b/lld/ELF/ScriptLexer.cpp @@ -27,15 +27,32 @@ //===----------------------------------------------------------------------===// #include "ScriptLexer.h" +#include "Config.h" #include "lld/Common/ErrorHandler.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" #include using namespace llvm; using namespace lld; using namespace lld::elf; +ScriptLexer::Buffer::Buffer(MemoryBufferRef mb) + : s(mb.getBuffer()), filename(mb.getBufferIdentifier()), + begin(mb.getBufferStart()) { + if (config->sysroot == "") + return; + StringRef path = filename; + for (; !path.empty(); path = sys::path::parent_path(path)) { + if (!sys::fs::equivalent(config->sysroot, path)) + continue; + isUnderSysroot = true; + return; + } +} + ScriptLexer::ScriptLexer(MemoryBufferRef mb) : curBuf(mb), mbs(1, mb) { activeFilenames.insert(mb.getBufferIdentifier()); } diff --git a/lld/ELF/ScriptLexer.h b/lld/ELF/ScriptLexer.h index 0c12b984c9e1b8..ffd84411f22ac7 100644 --- a/lld/ELF/ScriptLexer.h +++ b/lld/ELF/ScriptLexer.h @@ -25,10 +25,12 @@ class ScriptLexer { StringRef s, filename; const char *begin = nullptr; size_t lineNumber = 1; + // True if the script is opened as an absolute path under the --sysroot + // directory. + bool isUnderSysroot = false; + Buffer() = default; - Buffer(MemoryBufferRef mb) - : s(mb.getBuffer()), filename(mb.getBufferIdentifier()), - begin(mb.getBufferStart()) {} + Buffer(MemoryBufferRef mb); }; // The current buffer and parent buffers due to INCLUDE.
Buffer curBuf; diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index c0337338770936..107b4c6e7af067 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -48,22 +48,12 @@ namespace { class ScriptParser final : ScriptLexer { public: ScriptParser(MemoryBufferRef mb) : ScriptLexer(mb) { - // Initialize IsUnderSysroot - if (config->sysroot == "") - return; - StringRef path = mb.getBufferIdentifier(); - for (; !path.empty(); path = sys::path::parent_path(path)) { - if (!sys::fs::equivalent(config->sysroot, path)) - continue; - isUnderSysroot = true; - return; - } } void readLinkerScript(); void readVersionScript(); void readDynamicList(); - void readDefsym(StringRef name); + void readDefsym(); private: void addFile(StringRef path); @@ -135,9 +125,6 @@ class ScriptParser final : ScriptLexer { std::pair, SmallVector> readSymbols(); - // True if a script being read is in the --sysroot directory. - bool isUnderSysroot = false; - // If we are currently parsing a PROVIDE|PROVIDE_HIDDEN command, // then this member is set to the PROVIDE symbol name. std::optional activeProvideSym; @@ -296,9 +283,12 @@ void ScriptParser::readLinkerScript() { } } -void ScriptParser::readDefsym(StringRef name) { +void ScriptParser::readDefsym() { if (errorCount()) return; + inExpr = true; + StringRef name = readName(); + expect("="); Expr e = readExpr(); if (!atEOF()) setError("EOF expected, but got " + next()); @@ -319,7 +309,7 @@ void ScriptParser::readNoCrossRefs(bool to) { } void ScriptParser::addFile(StringRef s) { - if (isUnderSysroot && s.starts_with("/")) { + if (curBuf.isUnderSysroot && s.starts_with("/")) { SmallString<128> pathData; StringRef path = (config->sysroot + s).toStringRef(pathData); if (sys::fs::exists(path)) @@ -1867,7 +1857,4 @@ void elf::readDynamicList(MemoryBufferRef mb) { ScriptParser(mb).readDynamicList(); } -void elf::readDefsym(StringRef name, MemoryBufferRef mb) { - llvm::TimeTraceScope timeScope("Read defsym input", name); - ScriptParser(mb).readDefsym(name); -} +void elf::readDefsym(MemoryBufferRef mb) { ScriptParser(mb).readDefsym(); } diff --git a/lld/ELF/ScriptParser.h b/lld/ELF/ScriptParser.h index 34b27d2b4787c1..d6f71c5ee63291 100644 --- a/lld/ELF/ScriptParser.h +++ b/lld/ELF/ScriptParser.h @@ -24,7 +24,7 @@ void readVersionScript(MemoryBufferRef mb); void readDynamicList(MemoryBufferRef mb); // Parses the defsym expression. -void readDefsym(StringRef name, MemoryBufferRef mb); +void readDefsym(MemoryBufferRef mb); bool hasWildcard(StringRef s); diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp index 93653def328f82..263d4f35c5b9a2 100644 --- a/lld/ELF/Symbols.cpp +++ b/lld/ELF/Symbols.cpp @@ -73,7 +73,6 @@ Defined *ElfSym::riscvGlobalPointer; Defined *ElfSym::relaIpltStart; Defined *ElfSym::relaIpltEnd; Defined *ElfSym::tlsModuleBase; -SmallVector elf::symAux; static uint64_t getSymVA(const Symbol &sym, int64_t addend) { switch (sym.kind()) { diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h index e764fe8d736330..ff825615658ebb 100644 --- a/lld/ELF/Symbols.h +++ b/lld/ELF/Symbols.h @@ -56,17 +56,6 @@ enum { NEEDS_TLSIE = 1 << 8, }; -// Some index properties of a symbol are stored separately in this auxiliary -// struct to decrease sizeof(SymbolUnion) in the majority of cases. -struct SymbolAux { - uint32_t gotIdx = -1; - uint32_t pltIdx = -1; - uint32_t tlsDescIdx = -1; - uint32_t tlsGdIdx = -1; -}; - -LLVM_LIBRARY_VISIBILITY extern SmallVector symAux; - // The base class for real symbol classes. 
class Symbol { public: @@ -211,10 +200,10 @@ class Symbol { // truncated by Symbol::parseSymbolVersion(). const char *getVersionSuffix() const { return nameData + nameSize; } - uint32_t getGotIdx() const { return symAux[auxIdx].gotIdx; } - uint32_t getPltIdx() const { return symAux[auxIdx].pltIdx; } - uint32_t getTlsDescIdx() const { return symAux[auxIdx].tlsDescIdx; } - uint32_t getTlsGdIdx() const { return symAux[auxIdx].tlsGdIdx; } + uint32_t getGotIdx() const { return ctx.symAux[auxIdx].gotIdx; } + uint32_t getPltIdx() const { return ctx.symAux[auxIdx].pltIdx; } + uint32_t getTlsDescIdx() const { return ctx.symAux[auxIdx].tlsDescIdx; } + uint32_t getTlsGdIdx() const { return ctx.symAux[auxIdx].tlsGdIdx; } bool isInGot() const { return getGotIdx() != uint32_t(-1); } bool isInPlt() const { return getPltIdx() != uint32_t(-1); } @@ -325,8 +314,8 @@ class Symbol { // entries during postScanRelocations(); std::atomic flags; - // A symAux index used to access GOT/PLT entry indexes. This is allocated in - // postScanRelocations(). + // A ctx.symAux index used to access GOT/PLT entry indexes. This is allocated + // in postScanRelocations(). uint32_t auxIdx; uint32_t dynsymIndex; @@ -357,8 +346,8 @@ class Symbol { } void allocateAux() { assert(auxIdx == 0); - auxIdx = symAux.size(); - symAux.emplace_back(); + auxIdx = ctx.symAux.size(); + ctx.symAux.emplace_back(); } bool isSection() const { return type == llvm::ELF::STT_SECTION; } diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index b40ff0bc3cb031..b767392c4456cf 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -653,20 +653,20 @@ GotSection::GotSection() void GotSection::addConstant(const Relocation &r) { relocations.push_back(r); } void GotSection::addEntry(const Symbol &sym) { - assert(sym.auxIdx == symAux.size() - 1); - symAux.back().gotIdx = numEntries++; + assert(sym.auxIdx == ctx.symAux.size() - 1); + ctx.symAux.back().gotIdx = numEntries++; } bool GotSection::addTlsDescEntry(const Symbol &sym) { - assert(sym.auxIdx == symAux.size() - 1); - symAux.back().tlsDescIdx = numEntries; + assert(sym.auxIdx == ctx.symAux.size() - 1); + ctx.symAux.back().tlsDescIdx = numEntries; numEntries += 2; return true; } bool GotSection::addDynTlsEntry(const Symbol &sym) { - assert(sym.auxIdx == symAux.size() - 1); - symAux.back().tlsGdIdx = numEntries; + assert(sym.auxIdx == ctx.symAux.size() - 1); + ctx.symAux.back().tlsGdIdx = numEntries; // Global Dynamic TLS entries take two GOT slots. numEntries += 2; return true; @@ -997,12 +997,12 @@ void MipsGotSection::build() { for (auto &p : primGot->global) { if (p.first->auxIdx == 0) p.first->allocateAux(); - symAux.back().gotIdx = p.second; + ctx.symAux.back().gotIdx = p.second; } for (auto &p : primGot->relocs) { if (p.first->auxIdx == 0) p.first->allocateAux(); - symAux.back().gotIdx = p.second; + ctx.symAux.back().gotIdx = p.second; } // Create dynamic relocations. 
@@ -1171,8 +1171,8 @@ GotPltSection::GotPltSection() } void GotPltSection::addEntry(Symbol &sym) { - assert(sym.auxIdx == symAux.size() - 1 && - symAux.back().pltIdx == entries.size()); + assert(sym.auxIdx == ctx.symAux.size() - 1 && + ctx.symAux.back().pltIdx == entries.size()); entries.push_back(&sym); } @@ -1217,7 +1217,7 @@ IgotPltSection::IgotPltSection() target->gotEntrySize, getIgotPltName()) {} void IgotPltSection::addEntry(Symbol &sym) { - assert(symAux.back().pltIdx == entries.size()); + assert(ctx.symAux.back().pltIdx == entries.size()); entries.push_back(&sym); } @@ -2566,8 +2566,8 @@ void PltSection::writeTo(uint8_t *buf) { } void PltSection::addEntry(Symbol &sym) { - assert(sym.auxIdx == symAux.size() - 1); - symAux.back().pltIdx = entries.size(); + assert(sym.auxIdx == ctx.symAux.size() - 1); + ctx.symAux.back().pltIdx = entries.size(); entries.push_back(&sym); } @@ -2613,8 +2613,8 @@ size_t IpltSection::getSize() const { } void IpltSection::addEntry(Symbol &sym) { - assert(sym.auxIdx == symAux.size() - 1); - symAux.back().pltIdx = entries.size(); + assert(sym.auxIdx == ctx.symAux.size() - 1); + ctx.symAux.back().pltIdx = entries.size(); entries.push_back(&sym); } diff --git a/lld/test/ELF/defsym.s b/lld/test/ELF/defsym.s index fed937ffc1c9c3..eb409cccfc0338 100644 --- a/lld/test/ELF/defsym.s +++ b/lld/test/ELF/defsym.s @@ -5,14 +5,16 @@ # RUN: llvm-objdump -d --print-imm-hex %t | FileCheck %s --check-prefix=USE ## Check that we accept --defsym foo2=foo1 form. -# RUN: ld.lld -o %t2 %t.o --defsym foo2=foo1 +# RUN: ld.lld -o %t2 %t.o --defsym '"foo2"=foo1' # RUN: llvm-readelf -s %t2 | FileCheck %s # RUN: llvm-objdump -d --print-imm-hex %t2 | FileCheck %s --check-prefix=USE ## Check we are reporting the error correctly and don't crash ## when handling the second --defsym. 
-# RUN: not ld.lld -o /dev/null %t.o --defsym ERR+ --defsym foo2=foo1 2>&1 | FileCheck %s --check-prefix=ERR -# ERR: error: --defsym: syntax error: ERR+ +# RUN: not ld.lld -o /dev/null %t.o --defsym ERR+ --defsym foo2=foo1 2>&1 | FileCheck %s --check-prefix=ERR --strict-whitespace +# ERR:error: --defsym:1: = expected, but got + +# ERR-NEXT:>>> ERR+ +# ERR-NEXT:>>> ^ # CHECK-DAG: 0000000000000123 0 NOTYPE GLOBAL DEFAULT ABS foo1 # CHECK-DAG: 0000000000000123 0 NOTYPE GLOBAL DEFAULT ABS foo2 @@ -41,10 +43,9 @@ # ERR2: error: --defsym:1: EOF expected, but got , # RUN: not ld.lld -o /dev/null %t.o --defsym=foo 2>&1 | FileCheck %s -check-prefix=ERR3 -# ERR3: error: --defsym: syntax error: foo +# ERR3: error: --defsym:1: unexpected EOF -# RUN: not ld.lld -o /dev/null %t.o --defsym= 2>&1 | FileCheck %s -check-prefix=ERR4 -# ERR4: error: --defsym: syntax error: +# RUN: not ld.lld -o /dev/null %t.o --defsym= 2>&1 | FileCheck %s -check-prefix=ERR3 .globl foo1 foo1 = 0x123 diff --git a/lld/test/ELF/linkerscript/group.s b/lld/test/ELF/linkerscript/group.s index 89b09e529f4660..8cfe1631a8c842 100644 --- a/lld/test/ELF/linkerscript/group.s +++ b/lld/test/ELF/linkerscript/group.s @@ -3,62 +3,61 @@ # RUN: rm -rf %t.dir && mkdir %t.dir # RUN: rm -rf %t && split-file %s %t && cd %t -# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o a.o -# RUN: llvm-mc -filetype=obj -triple=x86_64 %p/Inputs/libsearch-st.s -o b.o -# RUN: llvm-ar rc %t.dir/libxyz.a b.o - -# RUN: echo 'GROUP("a.o")' > %t.t -# RUN: ld.lld -o %t2 %t.t -# RUN: llvm-readobj %t2 > /dev/null - -# RUN: echo 'INPUT("a.o")' > %t.t -# RUN: ld.lld -o %t2 %t.t -# RUN: llvm-readobj %t2 > /dev/null - -# RUN: echo 'GROUP("a.o" libxyz.a )' > %t.t -# RUN: not ld.lld -o /dev/null %t.t 2>/dev/null -# RUN: ld.lld -o %t2 %t.t -L%t.dir -# RUN: llvm-readobj %t2 > /dev/null - -# RUN: echo 'GROUP("a.o" =libxyz.a )' > %t.t -# RUN: not ld.lld -o /dev/null %t.t 2>/dev/null -# RUN: ld.lld -o %t2 %t.t --sysroot=%t.dir -# RUN: llvm-readobj %t2 > /dev/null - -# RUN: echo 'GROUP("a.o" -lxyz )' > %t.t -# RUN: not ld.lld -o /dev/null %t.t 2>/dev/null -# RUN: ld.lld -o %t2 %t.t -L%t.dir -# RUN: llvm-readobj %t2 > /dev/null - -# RUN: echo 'GROUP("a.o" libxyz.a )' > %t.t -# RUN: not ld.lld -o /dev/null %t.t 2>/dev/null -# RUN: ld.lld -o %t2 %t.t -L%t.dir -# RUN: llvm-readobj %t2 > /dev/null - -# RUN: echo 'GROUP("a.o" /libxyz.a )' > %t.t +# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o +# RUN: llvm-mc -filetype=obj -triple=x86_64 b.s -o b.o +# RUN: llvm-mc -filetype=obj -triple=x86_64 %p/Inputs/libsearch-st.s -o xyz.o +# RUN: llvm-ar rc %t.dir/libb.a b.o +# RUN: llvm-ar rc %t.dir/libxyz.a xyz.o + +# RUN: echo 'GROUP("a.o" libxyz.a -lxyz b.o )' > 1.t +# RUN: not ld.lld 1.t 2>&1 | FileCheck %s --check-prefix=NOLIB +# RUN: ld.lld 1.t -L%t.dir +# RUN: llvm-nm a.out | FileCheck %s + +# RUN: echo 'GROUP( "a.o" b.o =libxyz.a )' > 2.t +# RUN: not ld.lld 2.t 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=libxyz.a +# RUN: ld.lld 2.t --sysroot=%t.dir +# RUN: llvm-nm a.out | FileCheck %s + +# RUN: echo 'GROUP("%t.dir/3a.t")' > 3.t +# RUN: echo 'INCLUDE "%t.dir/3a.t"' > 3i.t +# RUN: echo 'GROUP(AS_NEEDED("a.o"))INPUT(/libb.a)' > %t.dir/3a.t +# RUN: ld.lld 3.t --sysroot=%t.dir +# RUN: llvm-nm a.out | FileCheck %s +# RUN: ld.lld 3i.t --sysroot=%t.dir +# RUN: llvm-nm a.out | FileCheck %s + +# RUN: echo 'GROUP("%t.dir/4a.t")INPUT(/libb.a)' > 4.t +# RUN: echo 'GROUP(AS_NEEDED("a.o"))' > %t.dir/4a.t +# RUN: not ld.lld 4.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN 
-DFILE=/libb.a + +# RUN: echo 'INCLUDE "%t.dir/5a.t" INPUT(/libb.a)' > 5.t +# RUN: echo 'GROUP(a.o)' > %t.dir/5a.t +# RUN: not ld.lld 5.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libb.a + +# CHECK: T _start + +# NOLIB: error: {{.*}}unable to find + +# RUN: echo 'GROUP("a.o" /libxyz.a )' > a.t # RUN: echo 'GROUP("%t/a.o" /libxyz.a )' > %t.dir/xyz.t -# RUN: not ld.lld -o /dev/null %t.t 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libxyz.a -# RUN: not ld.lld -o /dev/null %t.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libxyz.a +# RUN: not ld.lld a.t 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libxyz.a +# RUN: not ld.lld a.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libxyz.a ## Since %t.dir/%t does not exist, report an error, instead of falling back to %t ## without the sysroot prefix. -# RUN: not ld.lld -o /dev/null %t.dir/xyz.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_FIND_SYSROOT -DTMP=%t/a.o +# RUN: not ld.lld %t.dir/xyz.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_FIND_SYSROOT -DTMP=%t/a.o # CANNOT_FIND_SYSROOT: error: {{.*}}xyz.t:1: cannot find [[TMP]] inside {{.*}}.dir # CANNOT_FIND_SYSROOT-NEXT: >>> GROUP({{.*}} -# RUN: echo 'GROUP("2.t")' > 1.t -# RUN: echo 'GROUP("a.o")' > 2.t -# RUN: ld.lld 1.t -# RUN: llvm-readobj a.out > /dev/null - -# RUN: echo 'GROUP(AS_NEEDED("a.o"))' > 1.t -# RUN: ld.lld 1.t -# RUN: llvm-readobj a.out > /dev/null - # CANNOT_OPEN: error: cannot open [[FILE]]: {{.*}} #--- a.s .globl _start _start: - ret + call b + +#--- b.s +.globl b +b: diff --git a/lldb/include/lldb/Symbol/SymbolFile.h b/lldb/include/lldb/Symbol/SymbolFile.h index d20766788192f7..8419495da73a22 100644 --- a/lldb/include/lldb/Symbol/SymbolFile.h +++ b/lldb/include/lldb/Symbol/SymbolFile.h @@ -211,7 +211,15 @@ class SymbolFile : public PluginInterface { /// The characteristics of an array type. struct ArrayInfo { int64_t first_index = 0; - llvm::SmallVector<uint64_t, 1> element_orders; + + ///< Each entry belongs to a distinct DW_TAG_subrange_type. + ///< For multi-dimensional DW_TAG_array_types we would have + ///< an entry for each dimension. An entry represents the + ///< optional element count of the subrange. + /// + ///< The order of entries follows the order of the DW_TAG_subrange_type + ///< children of this DW_TAG_array_type.
+ llvm::SmallVector<std::optional<uint64_t>, 1> element_orders; uint32_t byte_stride = 0; uint32_t bit_stride = 0; }; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.cpp index 409e9bb37ab28b..4ed523bbb9e760 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.cpp @@ -37,7 +37,7 @@ DWARFASTParser::ParseChildArrayInfo(const DWARFDIE &parent_die, if (attributes.Size() == 0) continue; - uint64_t num_elements = 0; + std::optional<uint64_t> num_elements; uint64_t lower_bound = 0; uint64_t upper_bound = 0; bool upper_bound_valid = false; @@ -91,7 +91,7 @@ DWARFASTParser::ParseChildArrayInfo(const DWARFDIE &parent_die, } } - if (num_elements == 0) { + if (!num_elements || *num_elements == 0) { if (upper_bound_valid && upper_bound >= lower_bound) num_elements = upper_bound - lower_bound + 1; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 85c59a605c675c..a4dcde1629fc21 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -1395,20 +1395,20 @@ DWARFASTParserClang::ParseArrayType(const DWARFDIE &die, uint64_t array_element_bit_stride = byte_stride * 8 + bit_stride; CompilerType clang_type; if (array_info && array_info->element_orders.size() > 0) { - uint64_t num_elements = 0; auto end = array_info->element_orders.rend(); for (auto pos = array_info->element_orders.rbegin(); pos != end; ++pos) { - num_elements = *pos; - clang_type = m_ast.CreateArrayType(array_element_type, num_elements, - attrs.is_vector); + clang_type = m_ast.CreateArrayType( + array_element_type, /*element_count=*/*pos, attrs.is_vector); + + uint64_t num_elements = pos->value_or(0); array_element_type = clang_type; array_element_bit_stride = num_elements ?
array_element_bit_stride * num_elements : array_element_bit_stride; } } else { - clang_type = - m_ast.CreateArrayType(array_element_type, 0, attrs.is_vector); + clang_type = m_ast.CreateArrayType( + array_element_type, /*element_count=*/std::nullopt, attrs.is_vector); } ConstString empty_name; TypeSP type_sp = diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 0386e3b261c55c..484ca04fe04c9e 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -2233,30 +2233,31 @@ TypeSystemClang::CreateBlockPointerType(const CompilerType &function_type) { #pragma mark Array Types -CompilerType TypeSystemClang::CreateArrayType(const CompilerType &element_type, - size_t element_count, - bool is_vector) { - if (element_type.IsValid()) { - ASTContext &ast = getASTContext(); +CompilerType +TypeSystemClang::CreateArrayType(const CompilerType &element_type, + std::optional<size_t> element_count, + bool is_vector) { + if (!element_type.IsValid()) + return {}; - if (is_vector) { - return GetType(ast.getExtVectorType(ClangUtil::GetQualType(element_type), - element_count)); - } else { + ASTContext &ast = getASTContext(); - llvm::APInt ap_element_count(64, element_count); - if (element_count == 0) { - return GetType( - ast.getIncompleteArrayType(ClangUtil::GetQualType(element_type), - clang::ArraySizeModifier::Normal, 0)); - } else { - return GetType(ast.getConstantArrayType( - ClangUtil::GetQualType(element_type), ap_element_count, nullptr, - clang::ArraySizeModifier::Normal, 0)); - } - } - } - return CompilerType(); + // Unknown number of elements; this is an incomplete array + // (e.g., variable length array with non-constant bounds, or + // a flexible array member). + if (!element_count) + return GetType( + ast.getIncompleteArrayType(ClangUtil::GetQualType(element_type), + clang::ArraySizeModifier::Normal, 0)); + + if (is_vector) + return GetType(ast.getExtVectorType(ClangUtil::GetQualType(element_type), + *element_count)); + + llvm::APInt ap_element_count(64, *element_count); + return GetType(ast.getConstantArrayType(ClangUtil::GetQualType(element_type), + ap_element_count, nullptr, + clang::ArraySizeModifier::Normal, 0)); } CompilerType TypeSystemClang::CreateStructForIdentifier( @@ -4767,6 +4768,7 @@ TypeSystemClang::GetBitSize(lldb::opaque_compiler_type_t type, clang::QualType qual_type(GetCanonicalQualType(type)); const clang::Type::TypeClass type_class = qual_type->getTypeClass(); switch (type_class) { + case clang::Type::ConstantArray: case clang::Type::FunctionProto: case clang::Type::Record: return getASTContext().getTypeSize(qual_type); @@ -5457,9 +5459,9 @@ TypeSystemClang::GetNumChildren(lldb::opaque_compiler_type_t type, case clang::Type::IncompleteArray: if (auto array_info = GetDynamicArrayInfo(*this, GetSymbolFile(), qual_type, exe_ctx)) - // Only 1-dimensional arrays are supported. + // FIXME: Only 1-dimensional arrays are supported. num_children = array_info->element_orders.size() - ? array_info->element_orders.back() + ?
array_info->element_orders.back().value_or(0) : 0; break; diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index 70722eb375ab76..56a5c0a516706d 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -498,7 +498,8 @@ class TypeSystemClang : public TypeSystem { // Array Types CompilerType CreateArrayType(const CompilerType &element_type, - size_t element_count, bool is_vector); + std::optional<size_t> element_count, + bool is_vector); // Enumeration Types CompilerType CreateEnumerationType(llvm::StringRef name, diff --git a/lldb/test/API/lang/c/struct_types/main.c b/lldb/test/API/lang/c/struct_types/main.c index e683c491089762..70217c57bec5f0 100644 --- a/lldb/test/API/lang/c/struct_types/main.c +++ b/lldb/test/API/lang/c/struct_types/main.c @@ -1,3 +1,4 @@ +// clang-format off struct things_to_sum { int a; int b; @@ -18,7 +19,7 @@ int main (int argc, char const *argv[]) }; //% self.expect("frame variable pt.padding[0]", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ["pt.padding[0] = "]) //% self.expect("frame variable pt.padding[1]", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ["pt.padding[1] = "]) //% self.expect_expr("pt.padding[0]", result_type="char") - //% self.expect("image lookup -t point_tag", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ['padding[]']) + //% self.expect("image lookup -t point_tag", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ['padding[0]']) struct {} empty; //% self.expect("frame variable empty", substrs = ["empty = {}"]) diff --git a/lldb/test/Shell/SymbolFile/DWARF/vla.cpp b/lldb/test/Shell/SymbolFile/DWARF/vla.cpp new file mode 100644 index 00000000000000..344b100efd9f96 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/vla.cpp @@ -0,0 +1,80 @@ +// RUN: %clangxx_host -gdwarf -std=c++11 -o %t %s +// RUN: %lldb %t \ +// RUN: -o run \ +// RUN: -o "frame var --show-types f" \ +// RUN: -o "frame var vla0" \ +// RUN: -o "frame var fla0" \ +// RUN: -o "frame var fla1" \ +// RUN: -o "frame var vla01" \ +// RUN: -o "frame var vla10" \ +// RUN: -o "frame var vlaN" \ +// RUN: -o "frame var vlaNM" \ +// RUN: -o exit | FileCheck %s + +struct Foo { + static constexpr int n = 1; + int m_vlaN[n]; + + int m_vla0[0]; +}; + +int main() { + Foo f; + f.m_vlaN[0] = 60; + + // CHECK: (lldb) frame var --show-types f + // CHECK-NEXT: (Foo) f = { + // CHECK-NEXT: (int[1]) m_vlaN = { + // CHECK-NEXT: (int) [0] = 60 + // CHECK-NEXT: } + // CHECK-NEXT: (int[0]) m_vla0 = {} + // CHECK-NEXT: } + + int vla0[0] = {}; + + // CHECK: (lldb) frame var vla0 + // CHECK-NEXT: (int[0]) vla0 = {} + + int fla0[] = {}; + + // CHECK: (lldb) frame var fla0 + // CHECK-NEXT: (int[0]) fla0 = {} + + int fla1[] = {42}; + + // CHECK: (lldb) frame var fla1 + // CHECK-NEXT: (int[1]) fla1 = ([0] = 42) + + int vla01[0][1]; + + // CHECK: (lldb) frame var vla01 + // CHECK-NEXT: (int[0][1]) vla01 = {} + + int vla10[1][0]; + + // CHECK: (lldb) frame var vla10 + // CHECK-NEXT: (int[1][0]) vla10 = ([0] = int[0] + + int n = 3; + int vlaN[n]; + for (int i = 0; i < n; ++i) + vlaN[i] = -i; + + // CHECK: (lldb) frame var vlaN + // CHECK-NEXT: (int[]) vlaN = ([0] = 0, [1] = -1, [2] = -2) + + int m = 2; + int vlaNM[n][m]; + for (int i = 0; i < n; ++i) + for (int j = 0; j < m; ++j) + vlaNM[i][j] = i + j; + + // FIXME: multi-dimensional VLAs aren't well supported + // CHECK: (lldb) frame var vlaNM + // CHECK-NEXT: (int[][]) vlaNM = { + // CHECK-NEXT: [0] = ([0] = 0, [1] = 1, [2] = 1) //
CHECK-NEXT: [1] = ([0] = 1, [1] = 1, [2] = 2) + // CHECK-NEXT: } + + __builtin_debugtrap(); +} diff --git a/llvm/include/llvm/ADT/StableHashing.h b/llvm/include/llvm/ADT/StableHashing.h index 884b5752d9bb09..f675f828f702e5 100644 --- a/llvm/include/llvm/ADT/StableHashing.h +++ b/llvm/include/llvm/ADT/StableHashing.h @@ -95,18 +95,6 @@ inline stable_hash stable_hash_combine_array(const stable_hash *P, size_t C) { hashing::detail::stable_hash_append(Hash, P[I]); return Hash; } - -inline stable_hash stable_hash_combine_string(const StringRef &S) { - return stable_hash_combine_range(S.begin(), S.end()); -} - -inline stable_hash stable_hash_combine_string(const char *C) { - stable_hash Hash = hashing::detail::FNV_OFFSET_64; - while (*C) - hashing::detail::stable_hash_append(Hash, *(C++)); - return Hash; -} - } // namespace llvm #endif diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def index 444cef613fb00a..7ea010a345f38f 100644 --- a/llvm/include/llvm/Analysis/VecFuncs.def +++ b/llvm/include/llvm/Analysis/VecFuncs.def @@ -51,13 +51,19 @@ TLI_DEFINE_VECFUNC("llvm.cos.f32", "vcosf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("tanf", "vtanf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("llvm.tan.f32", "vtanf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("asinf", "vasinf", FIXED(4), "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.asin.f32", "vasinf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("acosf", "vacosf", FIXED(4), "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.acos.f32", "vacosf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("atanf", "vatanf", FIXED(4), "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.atan.f32", "vatanf", FIXED(4), "_ZGV_LLVM_N4v") // Hyperbolic Functions TLI_DEFINE_VECFUNC("sinhf", "vsinhf", FIXED(4), "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.sinh.f32", "vsinhf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("coshf", "vcoshf", FIXED(4), "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.cosh.f32", "vcoshf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("tanhf", "vtanhf", FIXED(4), "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.tanh.f32", "vtanhf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("asinhf", "vasinhf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("acoshf", "vacoshf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("atanhf", "vatanhf", FIXED(4), "_ZGV_LLVM_N4v") @@ -1358,9 +1364,17 @@ TLI_DEFINE_VECFUNC("asinf", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("asinf", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") TLI_DEFINE_VECFUNC("asinf", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v") +TLI_DEFINE_VECFUNC("llvm.asin.f64", "amd_vrd8_asin", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") +TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") +TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v") + TLI_DEFINE_VECFUNC("acosf", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("acosf", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") +TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") +TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") + TLI_DEFINE_VECFUNC("atan", "amd_vrd2_atan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v") TLI_DEFINE_VECFUNC("atan", "amd_vrd4_atan", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("atan", "amd_vrd8_atan", FIXED(8), NOMASK, 
"_ZGV_LLVM_N8v") @@ -1368,12 +1382,25 @@ TLI_DEFINE_VECFUNC("atanf", "amd_vrs4_atanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("atanf", "amd_vrs8_atanf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") TLI_DEFINE_VECFUNC("atanf", "amd_vrs16_atanf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v") +TLI_DEFINE_VECFUNC("llvm.atan.f64", "amd_vrd2_atan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v") +TLI_DEFINE_VECFUNC("llvm.atan.f64", "amd_vrd4_atan", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.atan.f64", "amd_vrd8_atan", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") +TLI_DEFINE_VECFUNC("llvm.atan.f32", "amd_vrs4_atanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.atan.f32", "amd_vrs8_atanf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") +TLI_DEFINE_VECFUNC("llvm.atan.f32", "amd_vrs16_atanf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v") + TLI_DEFINE_VECFUNC("coshf", "amd_vrs4_coshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("coshf", "amd_vrs8_coshf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") +TLI_DEFINE_VECFUNC("llvm.cosh.f32", "amd_vrs4_coshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.cosh.f32", "amd_vrs8_coshf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") + TLI_DEFINE_VECFUNC("tanhf", "amd_vrs4_tanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("tanhf", "amd_vrs8_tanhf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") +TLI_DEFINE_VECFUNC("llvm.tanh.f32", "amd_vrs4_tanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.tanh.f32", "amd_vrs8_tanhf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") + TLI_DEFINE_VECFUNC("cbrt", "amd_vrd2_cbrt", FIXED(2), NOMASK, "_ZGV_LLVM_N2v") TLI_DEFINE_VECFUNC("cbrtf", "amd_vrs4_cbrtf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") diff --git a/llvm/include/llvm/CodeGen/DebugHandlerBase.h b/llvm/include/llvm/CodeGen/DebugHandlerBase.h index 662c21ee6872e8..9a62ad396127db 100644 --- a/llvm/include/llvm/CodeGen/DebugHandlerBase.h +++ b/llvm/include/llvm/CodeGen/DebugHandlerBase.h @@ -140,6 +140,8 @@ class DebugHandlerBase { void beginBasicBlockSection(const MachineBasicBlock &MBB); void endBasicBlockSection(const MachineBasicBlock &MBB); + virtual void beginCodeAlignment(const MachineBasicBlock &MBB) {} + /// Return Label preceding the instruction. MCSymbol *getLabelBeforeInsn(const MachineInstr *MI); diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index d9e27e087e705b..9c70a60bf50c77 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -1616,7 +1616,8 @@ m_FCmp(const LHS &L, const RHS &R) { // Same as CmpClass, but instead of saving Pred as out output variable, match a // specific input pred for equality. 
-template <typename LHS_t, typename RHS_t, typename Class, typename PredicateTy> +template <typename LHS_t, typename RHS_t, typename Class, typename PredicateTy, + bool Commutable = false> struct SpecificCmpClass_match { const PredicateTy Predicate; LHS_t L; @@ -1626,9 +1627,17 @@ struct SpecificCmpClass_match { : Predicate(Pred), L(LHS), R(RHS) {} template <typename OpTy> bool match(OpTy *V) { - if (auto *I = dyn_cast<Class>(V)) - return I->getPredicate() == Predicate && L.match(I->getOperand(0)) && - R.match(I->getOperand(1)); + if (auto *I = dyn_cast<Class>(V)) { + if (I->getPredicate() == Predicate && L.match(I->getOperand(0)) && + R.match(I->getOperand(1))) + return true; + if constexpr (Commutable) { + if (I->getPredicate() == Class::getSwappedPredicate(Predicate) && + L.match(I->getOperand(1)) && R.match(I->getOperand(0))) + return true; + } + } + return false; } }; @@ -1647,6 +1656,13 @@ m_SpecificICmp(ICmpInst::Predicate MatchPred, const LHS &L, const RHS &R) { MatchPred, L, R); } +template <typename LHS, typename RHS> +inline SpecificCmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate, true> +m_c_SpecificICmp(ICmpInst::Predicate MatchPred, const LHS &L, const RHS &R) { + return SpecificCmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate, true>( + MatchPred, L, R); +} + template <typename LHS, typename RHS> inline SpecificCmpClass_match<LHS, RHS, FCmpInst, FCmpInst::Predicate> m_SpecificFCmp(FCmpInst::Predicate MatchPred, const LHS &L, const RHS &R) { diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h index ee93cb94100ec4..84f8e060320a1b 100644 --- a/llvm/include/llvm/MC/MCFragment.h +++ b/llvm/include/llvm/MC/MCFragment.h @@ -38,7 +38,6 @@ class MCFragment { enum FragmentType : uint8_t { FT_Align, FT_Data, - FT_CompactEncodedInst, FT_Fill, FT_Nops, FT_Relaxable, @@ -136,7 +135,6 @@ class MCEncodedFragment : public MCFragment { default: return false; case MCFragment::FT_Relaxable: - case MCFragment::FT_CompactEncodedInst: case MCFragment::FT_Data: case MCFragment::FT_Dwarf: case MCFragment::FT_DwarfFrame: @@ -172,29 +170,12 @@ class MCEncodedFragment : public MCFragment { } }; -/// Interface implemented by fragments that contain encoded instructions and/or -/// data. -/// -template <unsigned ContentsSize> -class MCEncodedFragmentWithContents : public MCEncodedFragment { - SmallVector<char, ContentsSize> Contents; - -protected: - MCEncodedFragmentWithContents(MCFragment::FragmentType FType, - bool HasInstructions) - : MCEncodedFragment(FType, HasInstructions) {} - -public: - SmallVectorImpl<char> &getContents() { return Contents; } - const SmallVectorImpl<char> &getContents() const { return Contents; } -}; - /// Interface implemented by fragments that contain encoded instructions and/or /// data and also have fixups registered. /// -template <unsigned ContentsSize, unsigned FixupsSize> -class MCEncodedFragmentWithFixups : - public MCEncodedFragmentWithContents<ContentsSize> { +template <unsigned ContentsSize, unsigned FixupsSize> +class MCEncodedFragmentWithFixups : public MCEncodedFragment { + SmallVector<char, ContentsSize> Contents; /// The list of fixups in this fragment. SmallVector<MCFixup, FixupsSize> Fixups; @@ -202,13 +183,16 @@ class MCEncodedFragmentWithFixups : protected: MCEncodedFragmentWithFixups(MCFragment::FragmentType FType, bool HasInstructions) - : MCEncodedFragmentWithContents<ContentsSize>(FType, HasInstructions) {} + : MCEncodedFragment(FType, HasInstructions) {} public: using const_fixup_iterator = SmallVectorImpl<MCFixup>::const_iterator; using fixup_iterator = SmallVectorImpl<MCFixup>::iterator; + SmallVectorImpl<char> &getContents() { return Contents; } + const SmallVectorImpl<char> &getContents() const { return Contents; } + SmallVectorImpl<MCFixup> &getFixups() { return Fixups; } const SmallVectorImpl<MCFixup> &getFixups() const { return Fixups; } @@ -240,21 +224,6 @@ class MCDataFragment : public MCEncodedFragmentWithFixups<32, 4> { void setLinkerRelaxable() { LinkerRelaxable = true; } }; -/// This is a compact (memory-size-wise) fragment for holding an encoded -/// instruction (non-relaxable) that has no fixups registered.
When applicable, -/// it can be used instead of MCDataFragment and lead to lower memory -/// consumption. -/// -class MCCompactEncodedInstFragment : public MCEncodedFragmentWithContents<4> { -public: - MCCompactEncodedInstFragment() - : MCEncodedFragmentWithContents(FT_CompactEncodedInst, true) {} - - static bool classof(const MCFragment *F) { - return F->getKind() == MCFragment::FT_CompactEncodedInst; - } -}; - /// A relaxable fragment holds on to its MCInst, since it may need to be /// relaxed during the assembler layout and relaxation stage. /// diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 1397d9da706436..667aeba1bda1f0 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -65,8 +65,8 @@ #define LLVM_SANDBOXIR_SANDBOXIR_H #include "llvm/IR/Function.h" -#include "llvm/IR/Instruction.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/SandboxIR/Tracker.h" @@ -772,10 +772,10 @@ class LoadInst final : public Instruction { unsigned getNumOfIRInstrs() const final { return 1u; } static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align, Instruction *InsertBefore, Context &Ctx, - const Twine &Name = ""); + bool IsVolatile = false, const Twine &Name = ""); static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align, BasicBlock *InsertAtEnd, Context &Ctx, - const Twine &Name = ""); + bool IsVolatile = false, const Twine &Name = ""); /// For isa/dyn_cast. static bool classof(const Value *From); Value *getPointerOperand() const; diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 3a7ae577bb068a..12a3193e637552 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -88,7 +88,7 @@ static Value *foldSelectWithBinaryOp(Value *Cond, Value *TrueVal, else return nullptr; - CmpInst::Predicate ExpectedPred, Pred1, Pred2; + CmpInst::Predicate ExpectedPred; if (BinOpCode == BinaryOperator::Or) { ExpectedPred = ICmpInst::ICMP_NE; } else if (BinOpCode == BinaryOperator::And) { @@ -110,10 +110,10 @@ static Value *foldSelectWithBinaryOp(Value *Cond, Value *TrueVal, // --> // %TV Value *X, *Y; - if (!match(Cond, m_c_BinOp(m_c_ICmp(Pred1, m_Specific(TrueVal), - m_Specific(FalseVal)), - m_ICmp(Pred2, m_Value(X), m_Value(Y)))) || - Pred1 != Pred2 || Pred1 != ExpectedPred) + if (!match(Cond, + m_c_BinOp(m_c_SpecificICmp(ExpectedPred, m_Specific(TrueVal), + m_Specific(FalseVal)), + m_SpecificICmp(ExpectedPred, m_Value(X), m_Value(Y))))) return nullptr; if (X == TrueVal || X == FalseVal || Y == TrueVal || Y == FalseVal) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 922d2f586cc814..9ad212dc3f142b 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -254,8 +254,7 @@ bool llvm::haveNoCommonBitsSet(const WithCache &LHSCache, bool llvm::isOnlyUsedInZeroComparison(const Instruction *I) { return !I->user_empty() && all_of(I->users(), [](const User *U) { - ICmpInst::Predicate P; - return match(U, m_ICmp(P, m_Value(), m_Zero())); + return match(U, m_ICmp(m_Value(), m_Zero())); }); } @@ -2588,10 +2587,10 @@ static bool isNonZeroRecurrence(const PHINode *PN) { } static bool matchOpWithOpEqZero(Value *Op0, Value *Op1) { - ICmpInst::Predicate Pred; - return (match(Op0, m_ZExtOrSExt(m_ICmp(Pred, m_Specific(Op1), m_Zero()))) || - match(Op1, 
m_ZExtOrSExt(m_ICmp(Pred, m_Specific(Op0), m_Zero())))) && - Pred == ICmpInst::ICMP_EQ; + return match(Op0, m_ZExtOrSExt(m_SpecificICmp(ICmpInst::ICMP_EQ, + m_Specific(Op1), m_Zero()))) || + match(Op1, m_ZExtOrSExt(m_SpecificICmp(ICmpInst::ICMP_EQ, + m_Specific(Op0), m_Zero()))); } static bool isNonZeroAdd(const APInt &DemandedElts, unsigned Depth, diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 9e983ec79e7bbb..dc15d86fa47da7 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -4016,6 +4016,9 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { CurrentSectionBeginSym = MBB.getSymbol(); } + for (auto &Handler : DebugHandlers) + Handler->beginCodeAlignment(MBB); + // Emit an alignment directive for this block, if needed. const Align Alignment = MBB.getAlignment(); if (Alignment != Align(1)) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 14347b07138d17..4722512b06e29b 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -3831,3 +3831,21 @@ bool DwarfDebug::alwaysUseRanges(const DwarfCompileUnit &CU) const { return true; return false; } + +void DwarfDebug::beginCodeAlignment(const MachineBasicBlock &MBB) { + if (MBB.getAlignment() == Align(1)) + return; + + auto *SP = MBB.getParent()->getFunction().getSubprogram(); + bool NoDebug = + !SP || SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug; + + if (NoDebug) + return; + + auto PrevLoc = Asm->OutStreamer->getContext().getCurrentDwarfLoc(); + Asm->OutStreamer->emitDwarfLocDirective( + PrevLoc.getFileNum(), 0, PrevLoc.getColumn(), 0, 0, 0, StringRef()); + MCDwarfLineEntry::make(Asm->OutStreamer.get(), + Asm->OutStreamer->getCurrentSectionOnly()); +} diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 1185a9331b11a8..7873e8163d79c5 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -763,6 +763,9 @@ class DwarfDebug : public DebugHandlerBase { /// Process beginning of an instruction. void beginInstruction(const MachineInstr *MI) override; + /// Process beginning of code alignment. + void beginCodeAlignment(const MachineBasicBlock &MBB) override; + /// Perform an MD5 checksum of \p Identifier and return the lower 64 bits. static uint64_t makeTypeSignature(StringRef Identifier); diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index 0a6ce6a1358170..20d5b2602df12c 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -2426,9 +2426,7 @@ bool InstrRefBasedLDV::mlocJoin( // as its predecessors. If a PHI is placed, test to see whether it's now a // redundant PHI that we can eliminate. - SmallVector BlockOrders; - for (auto *Pred : MBB.predecessors()) - BlockOrders.push_back(Pred); + SmallVector BlockOrders(MBB.predecessors()); // Visit predecessors in RPOT order. 
auto Cmp = [&](const MachineBasicBlock *A, const MachineBasicBlock *B) { @@ -3268,9 +3266,7 @@ void InstrRefBasedLDV::buildVLocValueMap( bool InLocsChanged = vlocJoin(*MBB, LiveOutIdx, BlocksToExplore, *LiveIn); - SmallVector Preds; - for (const auto *Pred : MBB->predecessors()) - Preds.push_back(Pred); + SmallVector Preds(MBB->predecessors()); // If this block's live-in value is a VPHI, try to pick a machine-value // for it. This makes the machine-value available and propagated diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp index 5abfbd5981fba8..d2e02a2d739c1b 100644 --- a/llvm/lib/CodeGen/MachineStableHash.cpp +++ b/llvm/lib/CodeGen/MachineStableHash.cpp @@ -33,6 +33,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/xxhash.h" #define DEBUG_TYPE "machine-stable-hash" @@ -100,8 +101,7 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { case MachineOperand::MO_TargetIndex: { if (const char *Name = MO.getTargetIndexName()) return stable_hash_combine(MO.getType(), MO.getTargetFlags(), - stable_hash_combine_string(Name), - MO.getOffset()); + xxh3_64bits(Name), MO.getOffset()); StableHashBailingTargetIndexNoName++; return 0; } @@ -113,7 +113,7 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { case MachineOperand::MO_ExternalSymbol: return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getOffset(), - stable_hash_combine_string(MO.getSymbolName())); + xxh3_64bits(MO.getSymbolName())); case MachineOperand::MO_RegisterMask: case MachineOperand::MO_RegisterLiveOut: { @@ -151,7 +151,7 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { case MachineOperand::MO_MCSymbol: { auto SymbolName = MO.getMCSymbol()->getName(); return hash_combine(MO.getType(), MO.getTargetFlags(), - stable_hash_combine_string(SymbolName)); + xxh3_64bits(SymbolName)); } case MachineOperand::MO_CFIIndex: return stable_hash_combine(MO.getType(), MO.getTargetFlags(), diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index fda5a28e86a8b3..cd114c8ed7bfc2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2483,6 +2483,11 @@ Align SelectionDAG::getReducedAlign(EVT VT, bool UseABI) { Align RedAlign2 = UseABI ? 
DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty); if (RedAlign2 < RedAlign) RedAlign = RedAlign2; + + if (!getMachineFunction().getFrameInfo().isStackRealignable()) + // If the stack is not realignable, the alignment should be limited to the + // StackAlignment + RedAlign = std::min(RedAlign, StackAlign); } return RedAlign; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index c215b91af0a167..a2c0b0524e2e69 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1246,9 +1246,7 @@ void SelectionDAGBuilder::visitDbgInfo(const Instruction &I) { SmallVector Values(It->Values.location_ops()); if (!handleDebugValue(Values, Var, It->Expr, It->DL, SDNodeOrder, It->Values.hasArgList())) { - SmallVector Vals; - for (Value *V : It->Values.location_ops()) - Vals.push_back(V); + SmallVector Vals(It->Values.location_ops()); addDanglingDebugInfo(Vals, FnVarLocs->getDILocalVariable(It->VariableID), It->Expr, Vals.size() > 1, It->DL, SDNodeOrder); diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp index 869b383dd064f2..0d7a51bfe73753 100644 --- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -260,9 +260,7 @@ void MCJIT::finalizeObject() { // Generate code for module is going to move objects out of the 'added' list, // so we need to copy that out before using it: - SmallVector ModsToAdd; - for (auto *M : OwnedModules.added()) - ModsToAdd.push_back(M); + SmallVector ModsToAdd(OwnedModules.added()); for (auto *M : ModsToAdd) generateCodeForModule(M); diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 791bd8f0e32ef4..2a91a18299c281 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -626,9 +626,7 @@ BasicBlock *BasicBlock::splitBasicBlockBefore(iterator I, const Twine &BBName) { // to reflect that the incoming branches will be from the New block and not // from predecessors of the 'this' block. // Save predecessors to separate vector before modifying them. 
- SmallVector Predecessors; - for (BasicBlock *Pred : predecessors(this)) - Predecessors.push_back(Pred); + SmallVector Predecessors(predecessors(this)); for (BasicBlock *Pred : Predecessors) { Instruction *TI = Pred->getTerminator(); TI->replaceSuccessorWith(this, New); diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index ceeb7af0fecc43..f6a3182a7868ce 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -56,8 +56,6 @@ STATISTIC(EmittedRelaxableFragments, "Number of emitted assembler fragments - relaxable"); STATISTIC(EmittedDataFragments, "Number of emitted assembler fragments - data"); -STATISTIC(EmittedCompactEncodedInstFragments, - "Number of emitted assembler fragments - compact encoded inst"); STATISTIC(EmittedAlignFragments, "Number of emitted assembler fragments - align"); STATISTIC(EmittedFillFragments, @@ -253,8 +251,6 @@ uint64_t MCAssembler::computeFragmentSize(const MCFragment &F) const { return cast(F).getContents().size(); case MCFragment::FT_Relaxable: return cast(F).getContents().size(); - case MCFragment::FT_CompactEncodedInst: - return cast(F).getContents().size(); case MCFragment::FT_Fill: { auto &FF = cast(F); int64_t NumValues = 0; @@ -662,11 +658,6 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm, OS << cast(F).getContents(); break; - case MCFragment::FT_CompactEncodedInst: - ++stats::EmittedCompactEncodedInstFragments; - OS << cast(F).getContents(); - break; - case MCFragment::FT_Fill: { ++stats::EmittedFillFragments; const MCFillFragment &FF = cast(F); diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index b1012507a1242f..3086730b1be8ab 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -37,9 +37,6 @@ void MCFragment::destroy() { case FT_Data: cast(this)->~MCDataFragment(); return; - case FT_CompactEncodedInst: - cast(this)->~MCCompactEncodedInstFragment(); - return; case FT_Fill: cast(this)->~MCFillFragment(); return; @@ -107,8 +104,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { switch (getKind()) { case MCFragment::FT_Align: OS << "MCAlignFragment"; break; case MCFragment::FT_Data: OS << "MCDataFragment"; break; - case MCFragment::FT_CompactEncodedInst: - OS << "MCCompactEncodedInstFragment"; break; case MCFragment::FT_Fill: OS << "MCFillFragment"; break; case MCFragment::FT_Nops: OS << "MCFNopsFragment"; @@ -168,19 +163,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { } break; } - case MCFragment::FT_CompactEncodedInst: { - const auto *CEIF = - cast(this); - OS << "\n "; - OS << " Contents:["; - const SmallVectorImpl &Contents = CEIF->getContents(); - for (unsigned i = 0, e = Contents.size(); i != e; ++i) { - if (i) OS << ","; - OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF); - } - OS << "] (" << Contents.size() << " bytes)"; - break; - } case MCFragment::FT_Fill: { const auto *FF = cast(this); OS << " Value:" << static_cast(FF->getValue()) diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 2dc9f5864dc5cc..d2b4fb207ba3ac 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -612,23 +612,23 @@ void BranchInst::dump() const { LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align, Instruction *InsertBefore, Context &Ctx, - const Twine &Name) { + bool IsVolatile, const Twine &Name) { llvm::Instruction *BeforeIR = InsertBefore->getTopmostLLVMInstruction(); auto &Builder = Ctx.getLLVMIRBuilder(); Builder.SetInsertPoint(BeforeIR); - auto 
*NewLI = Builder.CreateAlignedLoad(Ty, Ptr->Val, Align, - /*isVolatile=*/false, Name); + auto *NewLI = + Builder.CreateAlignedLoad(Ty, Ptr->Val, Align, IsVolatile, Name); auto *NewSBI = Ctx.createLoadInst(NewLI); return NewSBI; } LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align, BasicBlock *InsertAtEnd, Context &Ctx, - const Twine &Name) { + bool IsVolatile, const Twine &Name) { auto &Builder = Ctx.getLLVMIRBuilder(); Builder.SetInsertPoint(cast(InsertAtEnd->Val)); - auto *NewLI = Builder.CreateAlignedLoad(Ty, Ptr->Val, Align, - /*isVolatile=*/false, Name); + auto *NewLI = + Builder.CreateAlignedLoad(Ty, Ptr->Val, Align, IsVolatile, Name); auto *NewSBI = Ctx.createLoadInst(NewLI); return NewSBI; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index e91d05954a1c99..e724c978c44b61 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -225,10 +225,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { : m_Intrinsic()); for (User *ICmp : BlockCount->users()) { - ICmpInst::Predicate Pred; - if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) { - if (Pred != ICmpInst::ICMP_ULT) - continue; + if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin, + m_Specific(BlockCount)))) { ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType())); MadeChange = true; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index c31f85dbea127f..a75391fcfa8a4e 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -119,20 +119,6 @@ void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo, } } -void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (int64_t Offset = MI->getOperand(OpNo).getImm()) - O << " offset0:" << formatDec(Offset); -} - -void AMDGPUInstPrinter::printOffset1(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (int64_t Offset = MI->getOperand(OpNo).getImm()) - O << " offset1:" << formatDec(Offset); -} - void AMDGPUInstPrinter::printSMRDOffset8(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -145,13 +131,6 @@ void AMDGPUInstPrinter::printSMEMOffset(const MCInst *MI, unsigned OpNo, O << formatHex(MI->getOperand(OpNo).getImm()); } -void AMDGPUInstPrinter::printSMEMOffsetMod(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << " offset:"; - printSMEMOffset(MI, OpNo, STI, O); -} - void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -269,14 +248,6 @@ void AMDGPUInstPrinter::printScope(int64_t Scope, raw_ostream &O) { return; } -void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " dmask:"; - printU16ImmOperand(MI, OpNo, STI, O); - } -} - void AMDGPUInstPrinter::printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Dim = MI->getOperand(OpNo).getImm(); @@ -678,26 +649,6 @@ void AMDGPUInstPrinter::printBLGP(const MCInst *MI, unsigned OpNo, O << " blgp:" << Imm; } -void AMDGPUInstPrinter::printCBSZ(const MCInst *MI, 
unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Imm = MI->getOperand(OpNo).getImm(); - if (!Imm) - return; - - O << " cbsz:" << Imm; -} - -void AMDGPUInstPrinter::printABID(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Imm = MI->getOperand(OpNo).getImm(); - if (!Imm) - return; - - O << " abid:" << Imm; -} - void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -711,30 +662,6 @@ void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand, O << ", "; } -void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << " wait_vdst:" << formatDec(MI->getOperand(OpNo).getImm()); -} - -void AMDGPUInstPrinter::printWaitVAVDst(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << " wait_va_vdst:" << formatDec(MI->getOperand(OpNo).getImm()); -} - -void AMDGPUInstPrinter::printWaitVMVSrc(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << " wait_vm_vsrc:" << formatDec(MI->getOperand(OpNo).getImm()); -} - -void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << " wait_exp:" << formatDec(MI->getOperand(OpNo).getImm()); -} - bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc, unsigned OpNo) const { return OpNo == 0 && (Desc.TSFlags & SIInstrFlags::DPP) && @@ -1127,18 +1054,6 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, } } -void AMDGPUInstPrinter::printDppRowMask(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << " row_mask:" << formatHex(MI->getOperand(OpNo).getImm()); -} - -void AMDGPUInstPrinter::printDppBankMask(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - O << " bank_mask:" << formatHex(MI->getOperand(OpNo).getImm()); -} - void AMDGPUInstPrinter::printDppBoundCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -1782,14 +1697,13 @@ void AMDGPUInstPrinter::printEndpgm(const MCInst *MI, unsigned OpNo, O << ' ' << formatDec(Imm); } -void AMDGPUInstPrinter::printByteSel(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - uint8_t Imm = MI->getOperand(OpNo).getImm(); - if (!Imm) - return; - - O << " byte_sel:" << formatDec(Imm); +void AMDGPUInstPrinter::printNamedInt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O, StringRef Prefix, + bool PrintInHex, bool AlwaysPrint) { + int64_t V = MI->getOperand(OpNo).getImm(); + if (AlwaysPrint || V != 0) + O << ' ' << Prefix << ':' << (PrintInHex ? 
formatHex(V) : formatDec(V)); } #include "AMDGPUGenAsmWriter.inc" diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 4a39022aea7cfd..4d44db5d9d818c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -48,24 +48,16 @@ class AMDGPUInstPrinter : public MCInstPrinter { void printFlatOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); void printSMRDOffset8(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printSMEMOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printSMEMOffsetMod(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printCPol(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printTH(const MCInst *MI, int64_t TH, int64_t Scope, raw_ostream &O); void printScope(int64_t Scope, raw_ostream &O); - void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); void printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -110,10 +102,6 @@ class AMDGPUInstPrinter : public MCInstPrinter { raw_ostream &O); void printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printDppRowMask(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); - void printDppBankMask(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); void printDppBoundCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printDppFI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -154,21 +142,9 @@ class AMDGPUInstPrinter : public MCInstPrinter { const MCSubtargetInfo &STI, raw_ostream &O); void printBLGP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printCBSZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - void printABID(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); bool needsImpliedVcc(const MCInstrDesc &Desc, unsigned OpNo) const; void printDefaultVccOperand(bool FirstOperand, const MCSubtargetInfo &STI, raw_ostream &O); - void printWaitVDST(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - void printWaitEXP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - void printWaitVAVDst(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); - void printWaitVMVSrc(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); void printExpSrcN(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O, unsigned N); @@ -182,8 +158,9 @@ class AMDGPUInstPrinter : public MCInstPrinter { const MCSubtargetInfo &STI, raw_ostream &O); void printExpTgt(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printByteSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo 
&STI, - raw_ostream &O); + void printNamedInt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O, + StringRef Prefix, bool PrintInHex, bool AlwaysPrint); public: static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 00a8b159c95262..e8a67ab6f7e94c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1015,18 +1015,29 @@ def SDWAVopcDst : BoolRC { let PrintMethod = "printVOPDst"; } -class NamedIntOperand : CustomOperand { + string Prefix = prefix; + let PredicateMethod = "getPredicate([](const AMDGPUOperand &Op) -> bool { "# "return Op.isImmTy(AMDGPUOperand::"#ImmTy#"); })"; + string Validator = "[](int64_t V) { return true; }"; string ConvertMethod = "[](int64_t &V) { return "#Validator#"(V); }"; let ParserMethod = "[this](OperandVector &Operands) -> ParseStatus { "# "return parseIntWithPrefix(\""#Prefix#"\", Operands, "# "AMDGPUOperand::"#ImmTy#", "#ConvertMethod#"); }"; + + bit PrintInHex = 0; + bit AlwaysPrint = 0; + let PrintMethod = "[this](const MCInst *MI, unsigned OpNo, " + "const MCSubtargetInfo &STI, raw_ostream &O) { " + "printNamedInt(MI, OpNo, STI, O, \""#Prefix#"\", "# + !if(PrintInHex, "true", "false")#", "# + !if(AlwaysPrint, "true", "false")#"); }"; } class NamedBitOperand @@ -1065,6 +1076,7 @@ class ArrayOperand0 let ImmTy = "ImmTyOffset" in def flat_offset : CustomOperand; +let PrintMethod = "printOffset" in def Offset : NamedIntOperand; let Validator = "isUInt<8>" in { def Offset0 : NamedIntOperand; @@ -1103,6 +1115,7 @@ def exp_vm : NamedBitOperand<"vm", "ExpVM">; def FORMAT : CustomOperand; +let PrintInHex = 1 in def DMask : NamedIntOperand; def Dim : CustomOperand; @@ -1123,16 +1136,18 @@ def IndexKey8bit : CustomOperand; def dpp8 : CustomOperand; def dpp_ctrl : CustomOperand; -let DefaultValue = "0xf" in { +let DefaultValue = "0xf", PrintInHex = 1, AlwaysPrint = 1 in { def DppRowMask : NamedIntOperand; def DppBankMask : NamedIntOperand; } def DppBoundCtrl : NamedIntOperand { let ConvertMethod = "[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }"; + let PrintMethod = "printDppBoundCtrl"; } -let DecoderMethod = "decodeDpp8FI" in +let DecoderMethod = "decodeDpp8FI", PrintMethod = "printDppFI" in def Dpp8FI : NamedIntOperand; +let PrintMethod = "printDppFI" in def Dpp16FI : NamedIntOperand; def blgp : CustomOperand; @@ -1146,6 +1161,7 @@ def hwreg : CustomOperand; def exp_tgt : CustomOperand; +let AlwaysPrint = 1 in { def WaitVDST : NamedIntOperand { let Validator = "isUInt<4>"; } @@ -1158,6 +1174,7 @@ def WaitVAVDst : NamedIntOperand { def WaitVMVSrc : NamedIntOperand { let Validator = "isUInt<1>"; } +} // End AlwaysPrint = 1 def ByteSel : NamedIntOperand { let Validator = "isUInt<2>"; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 3d6e8efc905d57..8cc963a6c1bb56 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -11,7 +11,10 @@ def smrd_offset_8 : ImmOperand; let EncoderMethod = "getSMEMOffsetEncoding", DecoderMethod = "decodeSMEMOffset" in { def SMEMOffset : ImmOperand; -def SMEMOffsetMod : NamedIntOperand; +def SMEMOffsetMod : NamedIntOperand { + let AlwaysPrint = 1; + let PrintInHex = 1; +} def OptSMEMOffsetMod : NamedIntOperand { let ImmTy = SMEMOffsetMod.ImmTy; let PredicateMethod = SMEMOffsetMod.PredicateMethod; diff --git 
a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index b5ca045058cbc7..e940dce7faa144 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1963,7 +1963,7 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()) return LT.first * ST->getMVEVectorCostFactor(CostKind); - // Otherwise we use a legal convert followed by a min+max + // If we can, we use a legal convert followed by a min+max if (((ST->hasVFP2Base() && LT.second == MVT::f32) || (ST->hasFP64() && LT.second == MVT::f64) || (ST->hasFullFP16() && LT.second == MVT::f16) || @@ -1984,7 +1984,25 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, Cost += getIntrinsicInstrCost(Attrs2, CostKind); return LT.first * Cost; } - break; + // Otherwise we need to follow the default expansion that clamps the value + // using a float min/max with an fcmp+sel for NaN handling when signed. + Type *FPTy = ICA.getArgTypes()[0]; + Type *RetTy = ICA.getReturnType(); + IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy}); + InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind); + IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy}); + Cost += getIntrinsicInstrCost(Attrs2, CostKind); + Cost += + getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI, + RetTy, FPTy, TTI::CastContextHint::None, CostKind); + if (IsSigned) { + Type *CondTy = RetTy->getWithNewBitWidth(1); + Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy, + CmpInst::FCMP_UNO, CostKind); + Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, + CmpInst::FCMP_UNO, CostKind); + } + return Cost; } } diff --git a/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp b/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp index 00931b1f63b29a..e43437d00e91d4 100644 --- a/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp +++ b/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp @@ -123,9 +123,7 @@ bool PPCGenScalarMASSEntries::runOnModule(Module &M) { // The call to createScalarMASSCall() invalidates the iterator over users // upon replacing the users. Precomputing the current list of users allows // us to replace all the call sites. - SmallVector TheUsers; - for (auto *User : Func.users()) - TheUsers.push_back(User); + SmallVector<User *> TheUsers(Func.users()); for (auto *User : TheUsers) if (auto *CI = dyn_cast_or_null<CallInst>(User)) { diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index e676c2f94583d1..7abd5a49a1b5fc 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -1535,6 +1535,7 @@ void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI( const MachineFrameInfo &MFI = MF->getFrameInfo(); RISCVMachineFunctionInfo *RVFI = MF->getInfo<RISCVMachineFunctionInfo>(); const TargetInstrInfo &TII = *STI.getInstrInfo(); + const RISCVRegisterInfo &TRI = *STI.getRegisterInfo(); DebugLoc DL = MBB.findDebugLoc(MI); const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, MFI.getCalleeSavedInfo()); @@ -1554,12 +1555,22 @@ void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI( // Insert the spill to the stack frame.
int FI = CS.getFrameIdx(); if (FI >= 0 && MFI.getStackID(FI) == TargetStackID::ScalableVector) { - unsigned CFIIndex = MF->addFrameInst( - createDefCFAOffset(*STI.getRegisterInfo(), CS.getReg(), -FixedSize, - MFI.getObjectOffset(FI) / 8)); - BuildMI(MBB, MI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); + MCRegister BaseReg = TRI.getSubReg(CS.getReg(), RISCV::sub_vrm1_0); + // If it's not a grouped vector register, it doesn't have a subregister, so + // the base register is just itself. + if (BaseReg == RISCV::NoRegister) + BaseReg = CS.getReg(); + unsigned NumRegs = RISCV::VRRegClass.contains(CS.getReg()) ? 1 + : RISCV::VRM2RegClass.contains(CS.getReg()) ? 2 + : RISCV::VRM4RegClass.contains(CS.getReg()) ? 4 + : 8; + for (unsigned i = 0; i < NumRegs; ++i) { + unsigned CFIIndex = MF->addFrameInst(createDefCFAOffset( + TRI, BaseReg + i, -FixedSize, MFI.getObjectOffset(FI) / 8 + i)); + BuildMI(MBB, MI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameSetup); + } } } } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index de9060699e5577..9ce669a3122f5f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -21181,37 +21181,37 @@ bool RISCVTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const { // Check integral scalar types. - const bool HasZmmul = Subtarget.hasStdExtZmmul(); if (!VT.isScalarInteger()) return false; // Omit the optimization if the subtarget has the M extension and the data // size exceeds XLen. + const bool HasZmmul = Subtarget.hasStdExtZmmul(); if (HasZmmul && VT.getSizeInBits() > Subtarget.getXLen()) return false; - if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) { - // Break the MUL to a SLLI and an ADD/SUB. - const APInt &Imm = ConstNode->getAPIntValue(); - if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2() || - (1 - Imm).isPowerOf2() || (-1 - Imm).isPowerOf2()) - return true; + auto *ConstNode = cast<ConstantSDNode>(C); + const APInt &Imm = ConstNode->getAPIntValue(); - // Optimize the MUL to (SH*ADD x, (SLLI x, bits)) if Imm is not simm12. - if (Subtarget.hasStdExtZba() && !Imm.isSignedIntN(12) && - ((Imm - 2).isPowerOf2() || (Imm - 4).isPowerOf2() || - (Imm - 8).isPowerOf2())) - return true; + // Break the MUL to a SLLI and an ADD/SUB. + if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2() || + (1 - Imm).isPowerOf2() || (-1 - Imm).isPowerOf2()) + return true; - // Break the MUL to two SLLI instructions and an ADD/SUB, if Imm needs - // a pair of LUI/ADDI. - if (!Imm.isSignedIntN(12) && Imm.countr_zero() < 12 && - ConstNode->hasOneUse()) { - APInt ImmS = Imm.ashr(Imm.countr_zero()); - if ((ImmS + 1).isPowerOf2() || (ImmS - 1).isPowerOf2() || - (1 - ImmS).isPowerOf2()) - return true; - } + // Optimize the MUL to (SH*ADD x, (SLLI x, bits)) if Imm is not simm12. + if (Subtarget.hasStdExtZba() && !Imm.isSignedIntN(12) && + ((Imm - 2).isPowerOf2() || (Imm - 4).isPowerOf2() || + (Imm - 8).isPowerOf2())) + return true; + + // Break the MUL to two SLLI instructions and an ADD/SUB, if Imm needs + // a pair of LUI/ADDI.
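A worked instance of these power-of-two conditions (my own numbers, not from the patch): for Imm = 17, (Imm - 1) = 16 is a power of two, so x * 17 can lower to (x << 4) + x; for Imm = 15, (Imm + 1) = 16 gives (x << 4) - x. The first condition, as a standalone sketch over APInt:

#include "llvm/ADT/APInt.h"

// True if a multiply by Imm can become one shift plus one add/sub,
// mirroring the first test in decomposeMulByConstant above.
bool fitsOneShiftAdd(const llvm::APInt &Imm) {
  return (Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2() ||
         (1 - Imm).isPowerOf2() || (-1 - Imm).isPowerOf2();
}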
+ if (!Imm.isSignedIntN(12) && Imm.countr_zero() < 12 && + ConstNode->hasOneUse()) { + APInt ImmS = Imm.ashr(Imm.countr_zero()); + if ((ImmS + 1).isPowerOf2() || (ImmS - 1).isPowerOf2() || + (1 - ImmS).isPowerOf2()) + return true; } return false; diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 50aa19446f880a..42b8248006d1fd 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -1731,16 +1731,12 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); - setOperationAction(ISD::ADDC, MVT::i32, Custom); - setOperationAction(ISD::ADDE, MVT::i32, Custom); - setOperationAction(ISD::SUBC, MVT::i32, Custom); - setOperationAction(ISD::SUBE, MVT::i32, Custom); + setOperationAction(ISD::ADDC, MVT::i32, Legal); + setOperationAction(ISD::ADDE, MVT::i32, Legal); + setOperationAction(ISD::SUBC, MVT::i32, Legal); + setOperationAction(ISD::SUBE, MVT::i32, Legal); if (Subtarget->is64Bit()) { - setOperationAction(ISD::ADDC, MVT::i64, Custom); - setOperationAction(ISD::ADDE, MVT::i64, Custom); - setOperationAction(ISD::SUBC, MVT::i64, Custom); - setOperationAction(ISD::SUBE, MVT::i64, Custom); setOperationAction(ISD::BITCAST, MVT::f64, Expand); setOperationAction(ISD::BITCAST, MVT::i64, Expand); setOperationAction(ISD::SELECT, MVT::i64, Expand); @@ -1855,9 +1851,6 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::i64, Expand); setOperationAction(ISD::MULHS, MVT::i64, Expand); - setOperationAction(ISD::UMULO, MVT::i64, Custom); - setOperationAction(ISD::SMULO, MVT::i64, Custom); - setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand); setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand); setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand); @@ -3105,110 +3098,6 @@ static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) { return DstReg128; } -static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { - - if (Op.getValueType() != MVT::i64) - return Op; - - SDLoc dl(Op); - SDValue Src1 = Op.getOperand(0); - SDValue Src1Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1); - SDValue Src1Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src1, - DAG.getConstant(32, dl, MVT::i64)); - Src1Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1Hi); - - SDValue Src2 = Op.getOperand(1); - SDValue Src2Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2); - SDValue Src2Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src2, - DAG.getConstant(32, dl, MVT::i64)); - Src2Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2Hi); - - - bool hasChain = false; - unsigned hiOpc = Op.getOpcode(); - switch (Op.getOpcode()) { - default: llvm_unreachable("Invalid opcode"); - case ISD::ADDC: hiOpc = ISD::ADDE; break; - case ISD::ADDE: hasChain = true; break; - case ISD::SUBC: hiOpc = ISD::SUBE; break; - case ISD::SUBE: hasChain = true; break; - } - SDValue Lo; - SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Glue); - if (hasChain) { - Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo, - Op.getOperand(2)); - } else { - Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo); - } - SDValue Hi = DAG.getNode(hiOpc, dl, VTs, Src1Hi, Src2Hi, Lo.getValue(1)); - SDValue Carry = Hi.getValue(1); - - Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Lo); - Hi = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Hi); - Hi = DAG.getNode(ISD::SHL, dl, MVT::i64, 
Hi, - DAG.getConstant(32, dl, MVT::i64)); - - SDValue Dst = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, Lo); - SDValue Ops[2] = { Dst, Carry }; - return DAG.getMergeValues(Ops, dl); -} - -// Custom lower UMULO/SMULO for SPARC. This code is similar to ExpandNode() -// in LegalizeDAG.cpp except the order of arguments to the library function. -static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG, - const SparcTargetLowering &TLI) -{ - unsigned opcode = Op.getOpcode(); - assert((opcode == ISD::UMULO || opcode == ISD::SMULO) && "Invalid Opcode."); - - bool isSigned = (opcode == ISD::SMULO); - EVT VT = MVT::i64; - EVT WideVT = MVT::i128; - SDLoc dl(Op); - SDValue LHS = Op.getOperand(0); - - if (LHS.getValueType() != VT) - return Op; - - SDValue ShiftAmt = DAG.getConstant(63, dl, VT); - - SDValue RHS = Op.getOperand(1); - SDValue HiLHS, HiRHS; - if (isSigned) { - HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, ShiftAmt); - HiRHS = DAG.getNode(ISD::SRA, dl, MVT::i64, RHS, ShiftAmt); - } else { - HiLHS = DAG.getConstant(0, dl, VT); - HiRHS = DAG.getConstant(0, dl, MVT::i64); - } - - SDValue Args[] = { HiLHS, LHS, HiRHS, RHS }; - - TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setSExt(isSigned); - SDValue MulResult = TLI.makeLibCall(DAG, - RTLIB::MUL_I128, WideVT, - Args, CallOptions, dl).first; - SDValue BottomHalf, TopHalf; - std::tie(BottomHalf, TopHalf) = DAG.SplitScalar(MulResult, dl, VT, VT); - if (isSigned) { - SDValue Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, ShiftAmt); - TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, Tmp1, ISD::SETNE); - } else { - TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, DAG.getConstant(0, dl, VT), - ISD::SETNE); - } - // MulResult is a node with an illegal type. Because such things are not - // generally permitted during this phase of legalization, ensure that - // nothing is left using the node. The above EXTRACT_ELEMENT nodes should have - // been folded. - assert(MulResult->use_empty() && "Illegally typed node still in use!"); - - SDValue Ops[2] = { BottomHalf, TopHalf } ; - return DAG.getMergeValues(Ops, dl); -} - static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) { if (isStrongerThanMonotonic(cast(Op)->getSuccessOrdering())) { // Expand with a fence. 
@@ -3283,12 +3172,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FNEG: return LowerFNEGorFABS(Op, DAG, isV9); case ISD::FP_EXTEND: return LowerF128_FPEXTEND(Op, DAG, *this); case ISD::FP_ROUND: return LowerF128_FPROUND(Op, DAG, *this); - case ISD::ADDC: - case ISD::ADDE: - case ISD::SUBC: - case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); - case ISD::UMULO: - case ISD::SMULO: return LowerUMULO_SMULO(Op, DAG, *this); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerATOMIC_LOAD_STORE(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index fcc61d0a5e2f63..da9fc257792940 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -437,8 +437,6 @@ static size_t getSizeForInstFragment(const MCFragment *F) { return cast(*F).getContents().size(); case MCFragment::FT_Relaxable: return cast(*F).getContents().size(); - case MCFragment::FT_CompactEncodedInst: - return cast(*F).getContents().size(); } } @@ -884,9 +882,7 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const { if (LabeledFragments.count(&F)) Relaxable.clear(); - if (F.getKind() == MCFragment::FT_Data || - F.getKind() == MCFragment::FT_CompactEncodedInst) - // Skip and ignore + if (F.getKind() == MCFragment::FT_Data) // Skip and ignore continue; if (F.getKind() == MCFragment::FT_Relaxable) { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7f2c98db833044..4329fbfabefb26 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3432,12 +3432,9 @@ X86TargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc, if (BaseCost >= 0 && Subtarget.hasCCMP()) BaseCost += BrMergingCcmpBias; // a == b && a == c is a fast pattern on x86. 
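Here and in many hunks that follow, the patch switches to the new m_SpecificICmp matcher: rather than capturing the predicate via m_ICmp and testing it afterwards, the expected predicate becomes part of the pattern. A minimal before/after sketch (illustrative helpers, assuming llvm/IR/PatternMatch.h):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Old style: capture the predicate, then compare it by hand.
bool isEqCmpOld(Value *V) {
  ICmpInst::Predicate Pred;
  return match(V, m_ICmp(Pred, m_Value(), m_Value())) &&
         Pred == ICmpInst::ICMP_EQ;
}

// New style: the predicate check is folded into the match itself.
bool isEqCmpNew(Value *V) {
  return match(V, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value()));
}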
- ICmpInst::Predicate Pred; if (BaseCost >= 0 && Opc == Instruction::And && - match(Lhs, m_ICmp(Pred, m_Value(), m_Value())) && - Pred == ICmpInst::ICMP_EQ && - match(Rhs, m_ICmp(Pred, m_Value(), m_Value())) && - Pred == ICmpInst::ICMP_EQ) + match(Lhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())) && + match(Rhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value()))) BaseCost += 1; return {BaseCost, BrMergingLikelyBias.getValue(), BrMergingUnlikelyBias.getValue()}; @@ -30755,10 +30752,12 @@ static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) { if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value()))) return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE; if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) { - if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt()))) - return Pred == CmpInst::ICMP_SLT; - if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes()))) - return Pred == CmpInst::ICMP_SGT; + if (match(I->user_back(), + m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt()))) + return true; + if (match(I->user_back(), + m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes()))) + return true; } return false; } @@ -30766,10 +30765,12 @@ static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) { if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value()))) return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE; if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) { - if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt()))) - return Pred == CmpInst::ICMP_SLT; - if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes()))) - return Pred == CmpInst::ICMP_SGT; + if (match(I->user_back(), + m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt()))) + return true; + if (match(I->user_back(), + m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes()))) + return true; } return false; } @@ -30780,18 +30781,21 @@ static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) { if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt()))) return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SLT; - if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes()))) - return Pred == CmpInst::ICMP_SGT; + if (match(I->user_back(), + m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes()))) + return true; return false; } if (Opc == AtomicRMWInst::Xor) { if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value()))) return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE; if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) { - if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt()))) - return Pred == CmpInst::ICMP_SLT; - if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes()))) - return Pred == CmpInst::ICMP_SGT; + if (match(I->user_back(), + m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt()))) + return true; + if (match(I->user_back(), + m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes()))) + return true; } return false; } diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index d5a38ec17a2a84..09ffc2d184f18b 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -135,15 +135,12 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) { if (!DT.dominates(ShVal0, TermI) || !DT.dominates(ShVal1, TermI)) return false; - 
ICmpInst::Predicate Pred; BasicBlock *PhiBB = Phi.getParent(); - if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()), + if (!match(TermI, m_Br(m_SpecificICmp(CmpInst::ICMP_EQ, m_Specific(ShAmt), + m_ZeroInt()), m_SpecificBB(PhiBB), m_SpecificBB(FunnelBB)))) return false; - if (Pred != CmpInst::ICMP_EQ) - return false; - IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt()); if (ShVal0 == ShVal1) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 0a55f4762fdf0f..3bd086230cbec5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1694,12 +1694,10 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { // Canonicalize signum variant that ends in add: // (A s>> (BW - 1)) + (zext (A s> 0)) --> (A s>> (BW - 1)) | (zext (A != 0)) - ICmpInst::Predicate Pred; uint64_t BitWidth = Ty->getScalarSizeInBits(); if (match(LHS, m_AShr(m_Value(A), m_SpecificIntAllowPoison(BitWidth - 1))) && - match(RHS, m_OneUse(m_ZExt( - m_OneUse(m_ICmp(Pred, m_Specific(A), m_ZeroInt()))))) && - Pred == CmpInst::ICMP_SGT) { + match(RHS, m_OneUse(m_ZExt(m_OneUse(m_SpecificICmp( + CmpInst::ICMP_SGT, m_Specific(A), m_ZeroInt())))))) { Value *NotZero = Builder.CreateIsNotNull(A, "isnotnull"); Value *Zext = Builder.CreateZExt(NotZero, Ty, "isnotnull.zext"); return BinaryOperator::CreateOr(LHS, Zext); @@ -1711,12 +1709,13 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { // (add X, (sext/zext (icmp eq X, C))) // -> (select (icmp eq X, C), (add C, (sext/zext 1)), X) auto CondMatcher = m_CombineAnd( - m_Value(Cond), m_ICmp(Pred, m_Deferred(A), m_ImmConstant(C))); + m_Value(Cond), + m_SpecificICmp(ICmpInst::ICMP_EQ, m_Deferred(A), m_ImmConstant(C))); if (match(&I, m_c_Add(m_Value(A), m_CombineAnd(m_Value(Ext), m_ZExtOrSExt(CondMatcher)))) && - Pred == ICmpInst::ICMP_EQ && Ext->hasOneUse()) { + Ext->hasOneUse()) { Value *Add = isa(Ext) ? InstCombiner::AddOne(C) : InstCombiner::SubOne(C); return replaceInstUsesWith(I, Builder.CreateSelect(Cond, Add, A)); @@ -1791,6 +1790,7 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { // --> // BW - ctlz(A - 1, false) const APInt *XorC; + ICmpInst::Predicate Pred; if (match(&I, m_c_Add( m_ZExt(m_ICmp(Pred, m_Intrinsic(m_Value(A)), diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index f9caa4da44931a..4ca12d5b92f186 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -818,11 +818,11 @@ static Value *foldSignedTruncationCheck(ICmpInst *ICmp0, ICmpInst *ICmp1, // Match icmp ult (add %arg, C01), C1 (C1 == C01 << 1; powers of two) auto tryToMatchSignedTruncationCheck = [](ICmpInst *ICmp, Value *&X, APInt &SignBitMask) -> bool { - CmpInst::Predicate Pred; const APInt *I01, *I1; // powers of two; I1 == I01 << 1 - if (!(match(ICmp, - m_ICmp(Pred, m_Add(m_Value(X), m_Power2(I01)), m_Power2(I1))) && - Pred == ICmpInst::ICMP_ULT && I1->ugt(*I01) && I01->shl(1) == *I1)) + if (!(match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, + m_Add(m_Value(X), m_Power2(I01)), + m_Power2(I1))) && + I1->ugt(*I01) && I01->shl(1) == *I1)) return false; // Which bit is the new sign bit as per the 'signed truncation' pattern? 
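For intuition about the relation being enforced here, a worked instance (my own numbers): with I01 = 128 and I1 = 256, the pattern icmp ult (add %x, 128), 256 holds exactly when %x lies in [-128, 127], i.e. when %x truncates to i8 without changing its signed value; the recorded SignBitMask = 128 is the sign bit of that narrower type, and the required relation I1 == I01 << 1 is just 256 == 2 * 128.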
SignBitMask = *I01; @@ -936,20 +936,21 @@ static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd, std::swap(Cmp0, Cmp1); // (X != 0) && (ctpop(X) u< 2) --> ctpop(X) == 1 - CmpInst::Predicate Pred0, Pred1; Value *X; - if (JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) && - match(Cmp1, m_ICmp(Pred1, m_Intrinsic(m_Specific(X)), - m_SpecificInt(2))) && - Pred0 == ICmpInst::ICMP_NE && Pred1 == ICmpInst::ICMP_ULT) { + if (JoinedByAnd && + match(Cmp0, m_SpecificICmp(ICmpInst::ICMP_NE, m_Value(X), m_ZeroInt())) && + match(Cmp1, m_SpecificICmp(ICmpInst::ICMP_ULT, + m_Intrinsic(m_Specific(X)), + m_SpecificInt(2)))) { Value *CtPop = Cmp1->getOperand(0); return Builder.CreateICmpEQ(CtPop, ConstantInt::get(CtPop->getType(), 1)); } // (X == 0) || (ctpop(X) u> 1) --> ctpop(X) != 1 - if (!JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) && - match(Cmp1, m_ICmp(Pred1, m_Intrinsic(m_Specific(X)), - m_SpecificInt(1))) && - Pred0 == ICmpInst::ICMP_EQ && Pred1 == ICmpInst::ICMP_UGT) { + if (!JoinedByAnd && + match(Cmp0, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(X), m_ZeroInt())) && + match(Cmp1, m_SpecificICmp(ICmpInst::ICMP_UGT, + m_Intrinsic(m_Specific(X)), + m_SpecificInt(1)))) { Value *CtPop = Cmp1->getOperand(0); return Builder.CreateICmpNE(CtPop, ConstantInt::get(CtPop->getType(), 1)); } @@ -1608,31 +1609,30 @@ static Instruction *reassociateFCmps(BinaryOperator &BO, // There are 4 commuted variants of the pattern. Canonicalize operands of this // logic op so an fcmp is operand 0 and a matching logic op is operand 1. Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1), *X; - FCmpInst::Predicate Pred; - if (match(Op1, m_FCmp(Pred, m_Value(), m_AnyZeroFP()))) + if (match(Op1, m_FCmp(m_Value(), m_AnyZeroFP()))) std::swap(Op0, Op1); // Match inner binop and the predicate for combining 2 NAN checks into 1. Value *BO10, *BO11; FCmpInst::Predicate NanPred = Opcode == Instruction::And ? FCmpInst::FCMP_ORD : FCmpInst::FCMP_UNO; - if (!match(Op0, m_FCmp(Pred, m_Value(X), m_AnyZeroFP())) || Pred != NanPred || + if (!match(Op0, m_SpecificFCmp(NanPred, m_Value(X), m_AnyZeroFP())) || !match(Op1, m_BinOp(Opcode, m_Value(BO10), m_Value(BO11)))) return nullptr; // The inner logic op must have a matching fcmp operand. Value *Y; - if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) || - Pred != NanPred || X->getType() != Y->getType()) + if (!match(BO10, m_SpecificFCmp(NanPred, m_Value(Y), m_AnyZeroFP())) || + X->getType() != Y->getType()) std::swap(BO10, BO11); - if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) || - Pred != NanPred || X->getType() != Y->getType()) + if (!match(BO10, m_SpecificFCmp(NanPred, m_Value(Y), m_AnyZeroFP())) || + X->getType() != Y->getType()) return nullptr; // and (fcmp ord X, 0), (and (fcmp ord Y, 0), Z) --> and (fcmp ord X, Y), Z // or (fcmp uno X, 0), (or (fcmp uno Y, 0), Z) --> or (fcmp uno X, Y), Z - Value *NewFCmp = Builder.CreateFCmp(Pred, X, Y); + Value *NewFCmp = Builder.CreateFCmp(NanPred, X, Y); if (auto *NewFCmpInst = dyn_cast(NewFCmp)) { // Intersect FMF from the 2 source fcmps. 
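The requirement that both compares use the same NanPred is what makes the fold above sound: fcmp ord X, 0.0 is true exactly when X is not NaN (the zero constant never is), so two ord-against-zero checks joined by 'and' collapse into the single check fcmp ord X, Y, and dually for uno joined by 'or'. The m_SpecificFCmp rewrite folds the predicate test into the match without changing that logic.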
NewFCmpInst->copyIRFlags(Op0); @@ -1744,14 +1744,13 @@ Instruction *InstCombinerImpl::foldCastedBitwiseLogic(BinaryOperator &I) { // -> zext(bitwise(A < 0, icmp)) auto FoldBitwiseICmpZeroWithICmp = [&](Value *Op0, Value *Op1) -> Instruction * { - ICmpInst::Predicate Pred; Value *A; bool IsMatched = match(Op0, m_OneUse(m_LShr( m_Value(A), m_SpecificInt(Op0->getType()->getScalarSizeInBits() - 1)))) && - match(Op1, m_OneUse(m_ZExt(m_ICmp(Pred, m_Value(), m_Value())))); + match(Op1, m_OneUse(m_ZExt(m_ICmp(m_Value(), m_Value())))); if (!IsMatched) return nullptr; @@ -3878,14 +3877,14 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (match(&I, m_c_Or(m_CombineAnd(m_ExtractValue<1>(m_Value(UMulWithOv)), m_Value(Ov)), - m_CombineAnd(m_ICmp(Pred, - m_CombineAnd(m_ExtractValue<0>( - m_Deferred(UMulWithOv)), - m_Value(Mul)), - m_ZeroInt()), - m_Value(MulIsNotZero)))) && - (Ov->hasOneUse() || (MulIsNotZero->hasOneUse() && Mul->hasOneUse())) && - Pred == CmpInst::ICMP_NE) { + m_CombineAnd( + m_SpecificICmp(ICmpInst::ICMP_NE, + m_CombineAnd(m_ExtractValue<0>( + m_Deferred(UMulWithOv)), + m_Value(Mul)), + m_ZeroInt()), + m_Value(MulIsNotZero)))) && + (Ov->hasOneUse() || (MulIsNotZero->hasOneUse() && Mul->hasOneUse()))) { Value *A, *B; if (match(UMulWithOv, m_Intrinsic( m_Value(A), m_Value(B)))) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index f6c4b6e1809374..e6a00c116c9f0c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1500,10 +1500,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // Don't try to simplify calls without uses. It will not do anything useful, // but will result in the following folds being skipped. 
if (!CI.use_empty()) { - SmallVector Args; - Args.reserve(CI.arg_size()); - for (Value *Op : CI.args()) - Args.push_back(Op); + SmallVector Args(CI.args()); if (Value *V = simplifyCall(&CI, CI.getCalledOperand(), Args, SQ.getWithInstruction(&CI))) return replaceInstUsesWith(CI, V); @@ -3031,10 +3028,10 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // assume( (load addr) != null ) -> add 'nonnull' metadata to load // (if assume is valid at the load) - CmpInst::Predicate Pred; Instruction *LHS; - if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) && - Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load && + if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_NE, m_Instruction(LHS), + m_Zero())) && + LHS->getOpcode() == Instruction::Load && LHS->getType()->isPointerTy() && isValidAssumeForContext(II, LHS, &DT)) { MDNode *MD = MDNode::get(II->getContext(), std::nullopt); @@ -3073,8 +3070,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // into // call void @llvm.assume(i1 true) [ "nonnull"(i32* %PTR) ] if (EnableKnowledgeRetention && - match(IIOperand, m_Cmp(Pred, m_Value(A), m_Zero())) && - Pred == CmpInst::ICMP_NE && A->getType()->isPointerTy()) { + match(IIOperand, + m_SpecificICmp(ICmpInst::ICMP_NE, m_Value(A), m_Zero())) && + A->getType()->isPointerTy()) { if (auto *Replacement = buildAssumeFromKnowledge( {RetainedKnowledge{Attribute::NonNull, 0, A}}, Next, &AC, &DT)) { @@ -3094,9 +3092,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { uint64_t AlignMask; if (EnableKnowledgeRetention && match(IIOperand, - m_Cmp(Pred, m_And(m_Value(A), m_ConstantInt(AlignMask)), - m_Zero())) && - Pred == CmpInst::ICMP_EQ) { + m_SpecificICmp(ICmpInst::ICMP_EQ, + m_And(m_Value(A), m_ConstantInt(AlignMask)), + m_Zero()))) { if (isPowerOf2_64(AlignMask + 1)) { uint64_t Offset = 0; match(A, m_Add(m_Value(A), m_ConstantInt(Offset))); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index abadf54a967678..3b6df2760ecc24 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -5772,8 +5772,7 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) { // -> icmp eq/ne X, rotate-left(X) // We generally try to convert rotate-right -> rotate-left, this just // canonicalizes another case. 
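A concrete instance of the rotate canonicalization described above (my own example): for an i8 value, a rotate right by 3 equals a rotate left by 5, so icmp eq X, fshr(X, X, 3) and icmp eq X, fshl(X, X, 5) test the same property; rewriting toward the rotate-left form means later folds only have to recognize one shape.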
- CmpInst::Predicate PredUnused = Pred; - if (match(&I, m_c_ICmp(PredUnused, m_Value(A), + if (match(&I, m_c_ICmp(m_Value(A), m_OneUse(m_Intrinsic( m_Deferred(A), m_Deferred(A), m_Value(B)))))) return new ICmpInst( @@ -5783,8 +5782,7 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) { // Canonicalize: // icmp eq/ne OneUse(A ^ Cst), B --> icmp eq/ne (A ^ B), Cst Constant *Cst; - if (match(&I, m_c_ICmp(PredUnused, - m_OneUse(m_Xor(m_Value(A), m_ImmConstant(Cst))), + if (match(&I, m_c_ICmp(m_OneUse(m_Xor(m_Value(A), m_ImmConstant(Cst))), m_CombineAnd(m_Value(B), m_Unless(m_ImmConstant()))))) return new ICmpInst(Pred, Builder.CreateXor(A, B), Cst); @@ -5795,13 +5793,12 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) { m_c_Xor(m_Value(B), m_Deferred(A))), m_Sub(m_Value(B), m_Deferred(A))); std::optional IsZero = std::nullopt; - if (match(&I, m_c_ICmp(PredUnused, m_OneUse(m_c_And(m_Value(A), m_Matcher)), + if (match(&I, m_c_ICmp(m_OneUse(m_c_And(m_Value(A), m_Matcher)), m_Deferred(A)))) IsZero = false; // (icmp eq/ne (and (add/sub/xor X, P2), P2), 0) else if (match(&I, - m_ICmp(PredUnused, m_OneUse(m_c_And(m_Value(A), m_Matcher)), - m_Zero()))) + m_ICmp(m_OneUse(m_c_And(m_Value(A), m_Matcher)), m_Zero()))) IsZero = true; if (IsZero && isKnownToBeAPowerOfTwo(A, /* OrZero */ true, /*Depth*/ 0, &I)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 1505b899a06df2..aef55b43d10fe3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -2469,9 +2469,8 @@ static Instruction *foldSelectFunnelShift(SelectInst &Sel, // Finally, see if the select is filtering out a shift-by-zero. Value *Cond = Sel.getCondition(); - ICmpInst::Predicate Pred; - if (!match(Cond, m_OneUse(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()))) || - Pred != ICmpInst::ICMP_EQ) + if (!match(Cond, m_OneUse(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(ShAmt), + m_ZeroInt())))) return nullptr; // If this is not a rotate then the select was blocking poison from the diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 8a6ec3076ac621..c494fec84c1e6e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -388,8 +388,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I, // invert the transform that reduces set bits and infinite-loop. Value *X; const APInt *CmpC; - ICmpInst::Predicate Pred; - if (!match(I->getOperand(0), m_ICmp(Pred, m_Value(X), m_APInt(CmpC))) || + if (!match(I->getOperand(0), m_ICmp(m_Value(X), m_APInt(CmpC))) || isa(X) || CmpC->getBitWidth() != SelC->getBitWidth()) return ShrinkDemandedConstant(I, OpNo, DemandedMask); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 0d8e7e92c5c8e5..0fb8b639c97b95 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1651,14 +1651,14 @@ static Constant *constantFoldOperationIntoSelectOperand(Instruction &I, bool IsTrueArm) { SmallVector ConstOps; for (Value *Op : I.operands()) { - CmpInst::Predicate Pred; Constant *C = nullptr; if (Op == SI) { C = dyn_cast(IsTrueArm ? 
SI->getTrueValue() : SI->getFalseValue()); } else if (match(SI->getCondition(), - m_ICmp(Pred, m_Specific(Op), m_Constant(C))) && - Pred == (IsTrueArm ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE) && + m_SpecificICmp(IsTrueArm ? ICmpInst::ICMP_EQ + : ICmpInst::ICMP_NE, + m_Specific(Op), m_Constant(C))) && isGuaranteedNotToBeUndefOrPoison(C)) { // Pass } else { diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 4371b821eae63f..d2268f0a30a25d 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -149,9 +149,7 @@ class DFAJumpThreading { unfoldSelectInstrs(DominatorTree *DT, const SmallVector &SelectInsts) { DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - SmallVector Stack; - for (SelectInstToUnfold SIToUnfold : SelectInsts) - Stack.push_back(SIToUnfold); + SmallVector Stack(SelectInsts); while (!Stack.empty()) { SelectInstToUnfold SIToUnfold = Stack.pop_back_val(); diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 931606c6f8fe12..992139a95a43d3 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -1889,12 +1889,12 @@ struct DSEState { return true; auto *Ptr = Memset->getArgOperand(0); auto *TI = MallocBB->getTerminator(); - ICmpInst::Predicate Pred; BasicBlock *TrueBB, *FalseBB; - if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB, - FalseBB))) + if (!match(TI, m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(Ptr), + m_Zero()), + TrueBB, FalseBB))) return false; - if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB) + if (MemsetBB != FalseBB) return false; return true; }; diff --git a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp index 6092cd1bc08beb..ff077624802be2 100644 --- a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp +++ b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp @@ -160,9 +160,8 @@ static bool isProcessableCondBI(const ScalarEvolution &SE, const BranchInst *BI) { BasicBlock *TrueSucc = nullptr; BasicBlock *FalseSucc = nullptr; - ICmpInst::Predicate Pred; Value *LHS, *RHS; - if (!match(BI, m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), + if (!match(BI, m_Br(m_ICmp(m_Value(LHS), m_Value(RHS)), m_BasicBlock(TrueSucc), m_BasicBlock(FalseSucc)))) return false; diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index 1696e9c7267354..5af56b01cb69c4 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -565,9 +565,8 @@ void Mapper::remapDbgRecord(DbgRecord &DR) { } // Find Value operands and remap those. 
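The LoopBoundSplit hunk above uses the other new PatternMatch overload in this series: a capture-free m_ICmp for sites that never read the predicate, which removes the otherwise-dead Pred variable. A small sketch of matching a compare-and-branch this way (illustrative helper, assuming llvm/IR/PatternMatch.h):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Bind the compare operands and both successors; the predicate is
// deliberately left unconstrained and uncaptured.
bool matchCmpBranch(BranchInst *BI, Value *&L, Value *&R, BasicBlock *&T,
                    BasicBlock *&F) {
  return match(BI, m_Br(m_ICmp(m_Value(L), m_Value(R)), m_BasicBlock(T),
                        m_BasicBlock(F)));
}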
- SmallVector Vals, NewVals; - for (Value *Val : V.location_ops()) - Vals.push_back(Val); + SmallVector Vals(V.location_ops()); + SmallVector NewVals; for (Value *Val : Vals) NewVals.push_back(mapValue(Val)); diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp index 64e04cae2773fc..cb31e2a2ecaec4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp @@ -268,25 +268,25 @@ bool LoopIdiomVectorize::recognizeByteCompare() { return false; // Match the branch instruction for the header - ICmpInst::Predicate Pred; Value *MaxLen; BasicBlock *EndBB, *WhileBB; if (!match(Header->getTerminator(), - m_Br(m_ICmp(Pred, m_Specific(Index), m_Value(MaxLen)), + m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(Index), + m_Value(MaxLen)), m_BasicBlock(EndBB), m_BasicBlock(WhileBB))) || - Pred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(WhileBB)) + !CurLoop->contains(WhileBB)) return false; // WhileBB should contain the pattern of load & compare instructions. Match // the pattern and find the GEP instructions used by the loads. - ICmpInst::Predicate WhilePred; BasicBlock *FoundBB; BasicBlock *TrueBB; Value *LoadA, *LoadB; if (!match(WhileBB->getTerminator(), - m_Br(m_ICmp(WhilePred, m_Value(LoadA), m_Value(LoadB)), + m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(LoadA), + m_Value(LoadB)), m_BasicBlock(TrueBB), m_BasicBlock(FoundBB))) || - WhilePred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(TrueBB)) + !CurLoop->contains(TrueBB)) return false; Value *A, *B; diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll deleted file mode 100644 index 72a3392891592b..00000000000000 --- a/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll +++ /dev/null @@ -1,103 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=ALL %s -; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=ALL %s -; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL %s -; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s - -; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=ALL-SIZE %s -; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=ALL-SIZE %s -; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL-SIZE %s -; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s -; END.
- -define i32 @fcopysign(i32 %arg) { -; ALL-LABEL: 'fcopysign' -; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.copysign.f64(double undef, double undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef -; -; ALL-SIZE-LABEL: 'fcopysign' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.copysign.f64(double undef, double undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef -; - %F32 = call float @llvm.copysign.f32(float undef, float undef) - %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef) - %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef) - %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef) - - %F64 = call double @llvm.copysign.f64(double undef, double undef) - %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef) - %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef) - %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef) - - ret i32 undef -} - -define i32 @fsqrt(i32 %arg) { -; ALL-LABEL: 'fsqrt' -; ALL-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %F32 = call float @llvm.sqrt.f32(float undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef -; -; ALL-SIZE-LABEL: 'fsqrt' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.sqrt.f32(float undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef -; - %F32 = call float @llvm.sqrt.f32(float undef) - %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) - %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) - %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) - - %F64 = call double @llvm.sqrt.f64(double undef) - %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) - %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) - %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) - - ret i32 undef -} - -declare float @llvm.copysign.f32(float, float) -declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) -declare <8 x float> @llvm.copysign.v8f32(<8 x float>, <8 x float>) -declare <16 x float> @llvm.copysign.v16f32(<16 x float>, <16 x float>) - -declare double @llvm.copysign.f64(double, double) -declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) -declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) -declare <8 x double> @llvm.copysign.v8f64(<8 x double>, <8 x double>) - -declare float @llvm.sqrt.f32(float) -declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) -declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) -declare <16 x float> @llvm.sqrt.v16f32(<16 x float>) - -declare double 
@llvm.sqrt.f64(double) -declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) -declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) -declare <8 x double> @llvm.sqrt.v8f64(<8 x double>) diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arithmetic_fence.ll b/llvm/test/Analysis/CostModel/AMDGPU/arithmetic_fence.ll new file mode 100644 index 00000000000000..2cee15193a5030 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/arithmetic_fence.ll @@ -0,0 +1,139 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s + +define void @arithmetic_fence_f16() { +; ALL-LABEL: 'arithmetic_fence_f16' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.arithmetic.fence.f16(half undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.arithmetic.fence.v3f16(<3 x half> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.arithmetic.fence.v4f16(<4 x half> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.arithmetic.fence.v5f16(<5 x half> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = call <8 x half> @llvm.arithmetic.fence.v8f16(<8 x half> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.arithmetic.fence.v16f16(<16 x half> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.arithmetic.fence.v17f16(<17 x half> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'arithmetic_fence_f16' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.arithmetic.fence.f16(half undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.arithmetic.fence.v3f16(<3 x half> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.arithmetic.fence.v4f16(<4 x half> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.arithmetic.fence.v5f16(<5 x half> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = call <8 x half> @llvm.arithmetic.fence.v8f16(<8 x half> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.arithmetic.fence.v16f16(<16 x half> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.arithmetic.fence.v17f16(<17 x half> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.arithmetic.fence.f16(half undef) + %v2f16 = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x
half> undef) + %v3f16 = call <3 x half> @llvm.arithmetic.fence.v3f16(<3 x half> undef) + %v4f16 = call <4 x half> @llvm.arithmetic.fence.v4f16(<4 x half> undef) + %v5f16 = call <5 x half> @llvm.arithmetic.fence.v5f16(<5 x half> undef) + %v8f16 = call <8 x half> @llvm.arithmetic.fence.v8f16(<8 x half> undef) + %v16f16 = call <16 x half> @llvm.arithmetic.fence.v16f16(<16 x half> undef) + %v17f16 = call <17 x half> @llvm.arithmetic.fence.v17f16(<17 x half> undef) + ret void +} + +define void @arithmetic_fence_bf16() { +; ALL-LABEL: 'arithmetic_fence_bf16' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bf16 = call bfloat @llvm.arithmetic.fence.bf16(bfloat undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2bf16 = call <2 x bfloat> @llvm.arithmetic.fence.v2bf16(<2 x bfloat> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3bf16 = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4bf16 = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5bf16 = call <5 x bfloat> @llvm.arithmetic.fence.v5bf16(<5 x bfloat> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8bf16 = call <8 x bfloat> @llvm.arithmetic.fence.v8bf16(<8 x bfloat> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16bf16 = call <16 x bfloat> @llvm.arithmetic.fence.v16bf16(<16 x bfloat> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17bf16 = call <17 x bfloat> @llvm.arithmetic.fence.v17bf16(<17 x bfloat> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'arithmetic_fence_bf16' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bf16 = call bfloat @llvm.arithmetic.fence.bf16(bfloat undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2bf16 = call <2 x bfloat> @llvm.arithmetic.fence.v2bf16(<2 x bfloat> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3bf16 = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4bf16 = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5bf16 = call <5 x bfloat> @llvm.arithmetic.fence.v5bf16(<5 x bfloat> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8bf16 = call <8 x bfloat> @llvm.arithmetic.fence.v8bf16(<8 x bfloat> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16bf16 = call <16 x bfloat> @llvm.arithmetic.fence.v16bf16(<16 x bfloat> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17bf16 = call <17 x bfloat> @llvm.arithmetic.fence.v17bf16(<17 x bfloat> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.arithmetic.fence.bf16(bfloat undef) + %v2bf16 = call <2 x bfloat> @llvm.arithmetic.fence.v2bf16(<2 x bfloat> undef) + %v3bf16 = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat> undef) + %v4bf16 = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> undef) + %v5bf16 = call <5 x bfloat> @llvm.arithmetic.fence.v5bf16(<5 x 
bfloat> undef) + %v8bf16 = call <8 x bfloat> @llvm.arithmetic.fence.v8bf16(<8 x bfloat> undef) + %v16bf16 = call <16 x bfloat> @llvm.arithmetic.fence.v16bf16(<16 x bfloat> undef) + %v17bf16 = call <17 x bfloat> @llvm.arithmetic.fence.v17bf16(<17 x bfloat> undef) + ret void +} + +define void @arithmetic_fence_f32() { +; ALL-LABEL: 'arithmetic_fence_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.arithmetic.fence.f32(float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.arithmetic.fence.v3f32(<3 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.arithmetic.fence.v4f32(<4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.arithmetic.fence.v5f32(<5 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = call <16 x float> @llvm.arithmetic.fence.v16f32(<16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f32 = call <17 x float> @llvm.arithmetic.fence.v17f32(<17 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'arithmetic_fence_f32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.arithmetic.fence.f32(float undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.arithmetic.fence.v3f32(<3 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.arithmetic.fence.v4f32(<4 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.arithmetic.fence.v5f32(<5 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = call <16 x float> @llvm.arithmetic.fence.v16f32(<16 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f32 = call <17 x float> @llvm.arithmetic.fence.v17f32(<17 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.arithmetic.fence.f32(float undef) + %v2f32 = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> undef) + %v3f32 = call <3 x float> @llvm.arithmetic.fence.v3f32(<3 x float> undef) + %v4f32 = call <4 x float> @llvm.arithmetic.fence.v4f32(<4 x float> undef) + %v5f32 = call <5 x float> @llvm.arithmetic.fence.v5f32(<5 x float> undef) + %v8f32 = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> undef) + %v16f32 = call <16 x float> @llvm.arithmetic.fence.v16f32(<16 x float> undef) + %v17f32 = call <17 x float> @llvm.arithmetic.fence.v17f32(<17 x float> undef) + ret void +} + +define void 
@arithmetic_fence_f64() { +; ALL-LABEL: 'arithmetic_fence_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.arithmetic.fence.f64(double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.arithmetic.fence.v2f64(<2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.arithmetic.fence.v3f64(<3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.arithmetic.fence.v4f64(<4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.arithmetic.fence.v5f64(<5 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = call <8 x double> @llvm.arithmetic.fence.v8f64(<8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = call <16 x double> @llvm.arithmetic.fence.v16f64(<16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f64 = call <17 x double> @llvm.arithmetic.fence.v17f64(<17 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'arithmetic_fence_f64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.arithmetic.fence.f64(double undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.arithmetic.fence.v2f64(<2 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.arithmetic.fence.v3f64(<3 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.arithmetic.fence.v4f64(<4 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.arithmetic.fence.v5f64(<5 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = call <8 x double> @llvm.arithmetic.fence.v8f64(<8 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = call <16 x double> @llvm.arithmetic.fence.v16f64(<16 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f64 = call <17 x double> @llvm.arithmetic.fence.v17f64(<17 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.arithmetic.fence.f64(double undef) + %v2f64 = call <2 x double> @llvm.arithmetic.fence.v2f64(<2 x double> undef) + %v3f64 = call <3 x double> @llvm.arithmetic.fence.v3f64(<3 x double> undef) + %v4f64 = call <4 x double> @llvm.arithmetic.fence.v4f64(<4 x double> undef) + %v5f64 = call <5 x double> @llvm.arithmetic.fence.v5f64(<5 x double> undef) + %v8f64 = call <8 x double> @llvm.arithmetic.fence.v8f64(<8 x double> undef) + %v16f64 = call <16 x double> @llvm.arithmetic.fence.v16f64(<16 x double> undef) + %v17f64 = call <17 x double> @llvm.arithmetic.fence.v17f64(<17 x double> undef) + ret void +} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll new file mode 100644 index 00000000000000..e980c910480504 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll @@ -0,0 +1,263 @@ +; 
NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s + +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s + +define void @canonicalize_f16() { +; BASE-LABEL: 'canonicalize_f16' +; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'canonicalize_f16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) +; GFX8-NEXT:
Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'canonicalize_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'canonicalize_f16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'canonicalize_f16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'canonicalize_f16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half 
@llvm.canonicalize.f16(half undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'canonicalize_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'canonicalize_f16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.canonicalize.f16(half undef) #1 + %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) #1 + %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) #1 + %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> 
undef) #1 + %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef) #1 + %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) #1 + %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) #1 + ret void +} + +define void @canonicalize_bf16() { +; BASE-LABEL: 'canonicalize_bf16' +; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'canonicalize_bf16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'canonicalize_bf16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 
x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'canonicalize_bf16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'canonicalize_bf16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'canonicalize_bf16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> 
undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'canonicalize_bf16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'canonicalize_bf16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) #1 + %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) #1 + %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) #1 + %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) #1 + %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) #1 + %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) #1 + %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) #1 + ret void +} + +define void @canonicalize_f32() { +; ALL-LABEL: 'canonicalize_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'canonicalize_f32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.canonicalize.f32(float undef) #1 + %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) #1 + %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) #1 + %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef) #1 + %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef) #1 + %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef) #1 + %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef) #1 + %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) #1 + ret void +} + +define void @canonicalize_f64() { +; ALL-LABEL: 'canonicalize_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef) +; ALL-NEXT: Cost 
Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'canonicalize_f64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.canonicalize.f64(double undef) #1 + %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) #1 + %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef) #1 + %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef) #1 + %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef) #1 + %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef) #1 + %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) #1 + ret void +} + + + + + + diff --git a/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll new file mode 100644 index 00000000000000..3b7b1b4238b8a7 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll @@ -0,0 +1,278 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s + +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck
-check-prefixes=ALL-SIZE,GFX10-SIZE %s + +define void @copysign_f16() { +; BASE-LABEL: 'copysign_f16' +; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'copysign_f16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'copysign_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef) +; GFX9-NEXT: Cost Model: Found an 
estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'copysign_f16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'copysign_f16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'copysign_f16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> 
undef, <2 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'copysign_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'copysign_f16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v9f16 = call <9 x half> 
@llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.copysign.f16(half undef, half undef) + %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef) + %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef) + %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) + %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef) + %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) + %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef) + %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) + ret void +} + +define void @copysign_f32() { +; ALL-LABEL: 'copysign_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.copysign.f32(float undef, float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.copysign.v3f32(<3 x float> undef, <3 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.copysign.v5f32(<5 x float> undef, <5 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v9f32 = call <9 x float> @llvm.copysign.v9f32(<9 x float> undef, <9 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'copysign_f32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.copysign.f32(float undef, float undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.copysign.v3f32(<3 x float> undef, <3 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.copysign.v5f32(<5 x float> undef, <5 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v9f32 = call <9 x float> @llvm.copysign.v9f32(<9 x float> undef, <9 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost 
of 16 for instruction: %v16f32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.copysign.f32(float undef, float undef) + %v2f32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef) + %v3f32 = call <3 x float> @llvm.copysign.v3f32(<3 x float> undef, <3 x float> undef) + %v4f32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef) + %v5f32 = call <5 x float> @llvm.copysign.v5f32(<5 x float> undef, <5 x float> undef) + %v8f32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef) + %v9f32 = call <9 x float> @llvm.copysign.v9f32(<9 x float> undef, <9 x float> undef) + %v16f32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef) + ret void +} + +define void @copysign_bf16() { +; BASE-LABEL: 'copysign_bf16' +; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'copysign_bf16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 
= call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'copysign_bf16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'copysign_bf16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'copysign_bf16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 
for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'copysign_bf16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'copysign_bf16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated 
cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'copysign_bf16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef) + %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) + %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) + %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) + %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef) + %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) + %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef) + %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) + ret void +} + +define void @copysign_f64() { +; ALL-LABEL: 'copysign_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef) 
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'copysign_f64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.copysign.f64(double undef, double undef) + %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef) + %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef) + %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef) + %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef) + %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef) + %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef) + %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef) + ret void +} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/exp.ll b/llvm/test/Analysis/CostModel/AMDGPU/exp.ll new file mode 100644 index 00000000000000..a94b794f89a3d5 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/exp.ll @@ -0,0 +1,278 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s + +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s +; RUN: opt < %s
-mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s + +define void @exp_f16() { +; BASE-LABEL: 'exp_f16' +; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'exp_f16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'exp_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 23 for
instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'exp_f16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'exp_f16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'exp_f16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 23 
for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'exp_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'exp_f16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.exp.f16(half undef) + %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef) + %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef) + %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef) + %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef) + %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef) + %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef) + %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef) + ret void +} + +define void @exp_bf16() { +; BASE-LABEL: 'exp_bf16' +; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef) +; BASE-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'exp_bf16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'exp_bf16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'exp_bf16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call 
bfloat @llvm.exp.bf16(bfloat undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'exp_bf16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'exp_bf16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'exp_bf16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'exp_bf16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.exp.bf16(bfloat undef) + %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef) + %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef) + %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef) + %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef) + %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef) + %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef) + %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef) + ret void +} + +define void @exp_f32() { +; ALL-LABEL: 'exp_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp.f32(float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp.v3f32(<3 x float> undef) +; ALL-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp.v5f32(<5 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp.v17f32(<17 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'exp_f32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp.f32(float undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp.v3f32(<3 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp.v5f32(<5 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp.v17f32(<17 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.exp.f32(float undef) + %v2f32 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef) + %v3f32 = call <3 x float> @llvm.exp.v3f32(<3 x float> undef) + %v4f32 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef) + %v5f32 = call <5 x float> @llvm.exp.v5f32(<5 x float> undef) + %v8f32 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef) + %v16f32 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef) + %v17f32 = call <17 x float> @llvm.exp.v17f32(<17 x float> undef) + ret void +} + +define void @exp_f64() { +; ALL-LABEL: 'exp_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.exp.f64(double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.exp.v3f64(<3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.exp.v5f64(<5 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> 
@llvm.exp.v17f64(<17 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'exp_f64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.exp.f64(double undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.exp.v3f64(<3 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.exp.v5f64(<5 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.exp.v17f64(<17 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.exp.f64(double undef) + %v2f64 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef) + %v3f64 = call <3 x double> @llvm.exp.v3f64(<3 x double> undef) + %v4f64 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef) + %v5f64 = call <5 x double> @llvm.exp.v5f64(<5 x double> undef) + %v8f64 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef) + %v16f64 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef) + %v17f64 = call <17 x double> @llvm.exp.v17f64(<17 x double> undef) + ret void +} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/exp10.ll b/llvm/test/Analysis/CostModel/AMDGPU/exp10.ll new file mode 100644 index 00000000000000..0fea64d0b94f7a --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/exp10.ll @@ -0,0 +1,278 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s + +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s + +define void @exp10_f16() { +;
BASE-LABEL: 'exp10_f16' +; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'exp10_f16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'exp10_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'exp10_f16' +; 
GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'exp10_f16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'exp10_f16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'exp10_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'exp10_f16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.exp10.f16(half undef) + %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef) + %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef) + %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef) + %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef) + %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef) + %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef) + %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef) + ret void +} + +define void @exp10_bf16() { +; BASE-LABEL: 'exp10_bf16' +; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = 
call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'exp10_bf16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'exp10_bf16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'exp10_bf16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> 
@llvm.exp10.v3bf16(<3 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'exp10_bf16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'exp10_bf16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'exp10_bf16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef) +; GFX9-SIZE-NEXT: 
Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'exp10_bf16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef) + %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef) + %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef) + %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef) + %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef) + %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef) + %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef) + %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef) + ret void +} + +define void @exp10_f32() { +; ALL-LABEL: 'exp10_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp10.f32(float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp10.v2f32(<2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp10.v3f32(<3 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp10.v4f32(<4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 
10 for instruction: %v5f32 = call <5 x float> @llvm.exp10.v5f32(<5 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp10.v8f32(<8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp10.v16f32(<16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp10.v17f32(<17 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'exp10_f32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp10.f32(float undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp10.v2f32(<2 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp10.v3f32(<3 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp10.v4f32(<4 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp10.v5f32(<5 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp10.v8f32(<8 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp10.v16f32(<16 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp10.v17f32(<17 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.exp10.f32(float undef) + %v2f32 = call <2 x float> @llvm.exp10.v2f32(<2 x float> undef) + %v3f32 = call <3 x float> @llvm.exp10.v3f32(<3 x float> undef) + %v4f32 = call <4 x float> @llvm.exp10.v4f32(<4 x float> undef) + %v5f32 = call <5 x float> @llvm.exp10.v5f32(<5 x float> undef) + %v8f32 = call <8 x float> @llvm.exp10.v8f32(<8 x float> undef) + %v16f32 = call <16 x float> @llvm.exp10.v16f32(<16 x float> undef) + %v17f32 = call <17 x float> @llvm.exp10.v17f32(<17 x float> undef) + ret void +} + +define void @exp10_f64() { +; ALL-LABEL: 'exp10_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.exp10.f64(double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.exp10.v2f64(<2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.exp10.v3f64(<3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.exp10.v4f64(<4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.exp10.v5f64(<5 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.exp10.v8f64(<8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.exp10.v16f64(<16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.exp10.v17f64(<17 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret 
void +; +; ALL-SIZE-LABEL: 'exp10_f64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.exp10.f64(double undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.exp10.v2f64(<2 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.exp10.v3f64(<3 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.exp10.v4f64(<4 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.exp10.v5f64(<5 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.exp10.v8f64(<8 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.exp10.v16f64(<16 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.exp10.v17f64(<17 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.exp10.f64(double undef) + %v2f64 = call <2 x double> @llvm.exp10.v2f64(<2 x double> undef) + %v3f64 = call <3 x double> @llvm.exp10.v3f64(<3 x double> undef) + %v4f64 = call <4 x double> @llvm.exp10.v4f64(<4 x double> undef) + %v5f64 = call <5 x double> @llvm.exp10.v5f64(<5 x double> undef) + %v8f64 = call <8 x double> @llvm.exp10.v8f64(<8 x double> undef) + %v16f64 = call <16 x double> @llvm.exp10.v16f64(<16 x double> undef) + %v17f64 = call <17 x double> @llvm.exp10.v17f64(<17 x double> undef) + ret void +} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/exp2.ll b/llvm/test/Analysis/CostModel/AMDGPU/exp2.ll new file mode 100644 index 00000000000000..d1ff5e1551db32 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/exp2.ll @@ -0,0 +1,278 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s + +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s + +define void @exp2_f16() { +; BASE-LABEL: 'exp2_f16' +; BASE-NEXT: Cost Model: Found an estimated cost of 2 for
instruction: %f16 = call half @llvm.exp2.f16(half undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'exp2_f16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'exp2_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'exp2_f16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef) +; 
GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'exp2_f16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp2.f16(half undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'exp2_f16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'exp2_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %f16 = call half @llvm.exp2.f16(half undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'exp2_f16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.exp2.f16(half undef) + %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef) + %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef) + %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef) + %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef) + %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef) + %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef) + %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef) + ret void +} + +define void @exp2_bf16() { +; BASE-LABEL: 'exp2_bf16' +; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 
x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'exp2_bf16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'exp2_bf16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'exp2_bf16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an 
estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'exp2_bf16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'exp2_bf16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'exp2_bf16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> 
@llvm.exp2.v3bf16(<3 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'exp2_bf16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef) + %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef) + %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef) + %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef) + %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef) + %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef) + %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef) + %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef) + ret void +} + +define void @exp2_f32() { +; ALL-LABEL: 'exp2_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp2.f32(float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp2.v3f32(<3 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp2.v5f32(<5 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated 
cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp2.v17f32(<17 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'exp2_f32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp2.f32(float undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp2.v3f32(<3 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp2.v5f32(<5 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp2.v17f32(<17 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.exp2.f32(float undef) + %v2f32 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef) + %v3f32 = call <3 x float> @llvm.exp2.v3f32(<3 x float> undef) + %v4f32 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef) + %v5f32 = call <5 x float> @llvm.exp2.v5f32(<5 x float> undef) + %v8f32 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef) + %v16f32 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef) + %v17f32 = call <17 x float> @llvm.exp2.v17f32(<17 x float> undef) + ret void +} + +define void @exp2_f64() { +; ALL-LABEL: 'exp2_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.exp2.f64(double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.exp2.v3f64(<3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.exp2.v5f64(<5 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.exp2.v17f64(<17 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'exp2_f64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.exp2.f64(double undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.exp2.v2f64(<2 x 
double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.exp2.v3f64(<3 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.exp2.v5f64(<5 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.exp2.v17f64(<17 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.exp2.f64(double undef) + %v2f64 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef) + %v3f64 = call <3 x double> @llvm.exp2.v3f64(<3 x double> undef) + %v4f64 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef) + %v5f64 = call <5 x double> @llvm.exp2.v5f64(<5 x double> undef) + %v8f64 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef) + %v16f64 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef) + %v17f64 = call <17 x double> @llvm.exp2.v17f64(<17 x double> undef) + ret void +} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll index daad19e42f3c40..da198ad9c028c8 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll @@ -1,116 +1,139 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s -; END.
-define amdgpu_kernel void @fabs_f32() #0 { -; ALL-LABEL: 'fabs_f32' -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.fabs.f32(float undef) #2 -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) #2 -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef) #2 -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) #2 -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef) #2 -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) #2 -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v9f32 = call <9 x float> @llvm.fabs.v9f32(<9 x float> undef) #2 +define void @fabs_f16() { +; ALL-LABEL: 'fabs_f16' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.fabs.f16(half undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; -; ALL-SIZE-LABEL: 'fabs_f32' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.fabs.f32(float undef) #2 -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) #2 -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef) #2 -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) #2 -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef) #2 -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) #2 -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v9f32 = call <9 x float> @llvm.fabs.v9f32(<9 x float> undef) #2 +; ALL-SIZE-LABEL: 'fabs_f16' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.fabs.f16(half undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = 
call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f32 = call float @llvm.fabs.f32(float undef) #1 - %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) #1 - %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef) #1 - %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) #1 - %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef) #1 - %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) #1 - %v9f32 = call <9 x float> @llvm.fabs.v9f32(<9 x float> undef) #1 + %f16 = call half @llvm.fabs.f16(half undef) + %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) + %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) + %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) + %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef) + %v8f16 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef) + %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) + %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef) ret void } -define amdgpu_kernel void @fabs_f64() #0 { -; ALL-LABEL: 'fabs_f64' -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.fabs.f64(double undef) #2 -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) #2 -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef) #2 -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) #2 -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef) #2 +define void @fabs_bf16() { +; ALL-LABEL: 'fabs_bf16' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bf16 = call bfloat @llvm.fabs.bf16(bfloat undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fabs.v3bf16(<3 x bfloat> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fabs.v5bf16(<5 x bfloat> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8bf16 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef) +; ALL-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fabs.v17bf16(<17 x bfloat> undef)
 ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; ALL-SIZE-LABEL: 'fabs_f64'
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.fabs.f64(double undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef) #2
+; ALL-SIZE-LABEL: 'fabs_bf16'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bf16 = call bfloat @llvm.fabs.bf16(bfloat undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fabs.v3bf16(<3 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fabs.v5bf16(<5 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8bf16 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fabs.v17bf16(<17 x bfloat> undef)
 ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f64 = call double @llvm.fabs.f64(double undef) #1
-  %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) #1
-  %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef) #1
-  %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) #1
-  %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef) #1
+  %bf16 = call bfloat @llvm.fabs.bf16(bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.fabs.v3bf16(<3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.fabs.v5bf16(<5 x bfloat> undef)
+  %v8bf16 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.fabs.v17bf16(<17 x bfloat> undef)
   ret void
 }

-define amdgpu_kernel void @fabs_f16() #0 {
-; ALL-LABEL: 'fabs_f16'
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.fabs.f16(half undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef) #2
+define void @fabs_f32() {
+; ALL-LABEL: 'fabs_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.fabs.f32(float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f32 = call <17 x float> @llvm.fabs.v17f32(<17 x float> undef)
 ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; ALL-SIZE-LABEL: 'fabs_f16'
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.fabs.f16(half undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef) #2
+; ALL-SIZE-LABEL: 'fabs_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.fabs.f32(float undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f32 = call <17 x float> @llvm.fabs.v17f32(<17 x float> undef)
 ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f16 = call half @llvm.fabs.f16(half undef) #1
-  %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) #1
-  %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) #1
-  %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) #1
-  %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef) #1
-  %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) #1
-  %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef) #1
+  %f32 = call float @llvm.fabs.f32(float undef)
+  %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
+  %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef)
+  %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef)
+  %v16f32 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef)
+  %v17f32 = call <17 x float> @llvm.fabs.v17f32(<17 x float> undef)
   ret void
 }

-declare float @llvm.fabs.f32(float) #1
-declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #1
-declare <3 x float> @llvm.fabs.v3f32(<3 x float>) #1
-declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #1
-declare <5 x float> @llvm.fabs.v5f32(<5 x float>) #1
-declare <8 x float> @llvm.fabs.v8f32(<8 x float>) #1
-declare <9 x float> @llvm.fabs.v9f32(<9 x float>) #1
-
-declare double @llvm.fabs.f64(double) #1
-declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #1
-declare <3 x double> @llvm.fabs.v3f64(<3 x double>) #1
-declare <4 x double> @llvm.fabs.v4f64(<4 x double>) #1
-declare <5 x double> @llvm.fabs.v5f64(<5 x double>) #1
-
-declare half @llvm.fabs.f16(half) #1
-declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
-declare <3 x half> @llvm.fabs.v3f16(<3 x half>) #1
-declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
-declare <5 x half> @llvm.fabs.v5f16(<5 x half>) #1
-declare <16 x half> @llvm.fabs.v16f16(<16 x half>) #1
-declare <17 x half> @llvm.fabs.v17f16(<17 x half>) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
+define void @fabs_f64() {
+; ALL-LABEL: 'fabs_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.fabs.f64(double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = call <16 x double> @llvm.fabs.v16f64(<16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f64 = call <17 x double> @llvm.fabs.v17f64(<17 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'fabs_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.fabs.f64(double undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = call <16 x double> @llvm.fabs.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f64 = call <17 x double> @llvm.fabs.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.fabs.f64(double undef)
+  %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef)
+  %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef)
+  %v8f64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef)
+  %v16f64 = call <16 x double> @llvm.fabs.v16f64(<16 x double> undef)
+  %v17f64 = call <17 x double> @llvm.fabs.v17f64(<17 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
index ab4e98201f6d78..3e58e971ce1ca4 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
@@ -1,214 +1,167 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,SLOWF64 %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,FASTF64 %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,SLOWF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST %s
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,SLOWF64-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,FASTF64-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,SLOWF64-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW-SIZE %s
-; END.
-define amdgpu_kernel void @fma_f32() #0 {
-; SLOWF64-LABEL: 'fma_f32'
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; FASTF64-LABEL: 'fma_f32'
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=SLOW-SIZE %s
+
+
+define void @fma_f16() {
+; FAST-LABEL: 'fma_f16'
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SLOW-LABEL: 'fma_f32'
-; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOW-LABEL: 'fma_f16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
 ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SLOWF64-SIZE-LABEL: 'fma_f32'
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; FASTF64-SIZE-LABEL: 'fma_f32'
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; FAST-SIZE-LABEL: 'fma_f16'
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; SLOW-SIZE-LABEL: 'fma_f32'
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOW-SIZE-LABEL: 'fma_f16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
 ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #1
-  %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #1
-  %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #1
-  %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #1
-  %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #1
-  %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #1
-  %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #1
+  %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+  %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+  %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+  %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+  %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
   ret void
 }

-define amdgpu_kernel void @fma_f64() #0 {
-; SLOWF64-LABEL: 'fma_f64'
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; FASTF64-LABEL: 'fma_f64'
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+define void @fma_bf16() {
+; FAST-LABEL: 'fma_bf16'
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SLOW-LABEL: 'fma_f64'
-; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
+; SLOW-LABEL: 'fma_bf16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
 ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SLOWF64-SIZE-LABEL: 'fma_f64'
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; FASTF64-SIZE-LABEL: 'fma_f64'
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; FAST-SIZE-LABEL: 'fma_bf16'
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; SLOW-SIZE-LABEL: 'fma_f64'
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
+; SLOW-SIZE-LABEL: 'fma_bf16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
 ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #1
-  %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #1
-  %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #1
-  %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #1
-  %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #1
+  %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
   ret void
 }

-define amdgpu_kernel void @fma_f16() #0 {
-; FAST-LABEL: 'fma_f16'
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; SLOW-LABEL: 'fma_f16'
-; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
+define void @fma_f32() {
+; SLOW-LABEL: 'fma_f32'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
 ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; FAST-SIZE-LABEL: 'fma_f16'
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SLOW-SIZE-LABEL: 'fma_f16'
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
+; SLOW-SIZE-LABEL: 'fma_f32'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
 ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #1
-  %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #1
-  %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #1
-  %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #1
-  %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #1
-  %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #1
-  %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #1
+  %f32 = call float @llvm.fma.f32(float undef, float undef, float undef)
+  %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+  %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+  %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+  %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+  %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
   ret void
 }

-declare float @llvm.fma.f32(float, float, float) #1
-declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
-declare <3 x float> @llvm.fma.v3f32(<3 x float>, <3 x float>, <3 x float>) #1
-declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
-declare <5 x float> @llvm.fma.v5f32(<5 x float>, <5 x float>, <5 x float>) #1
-declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #1
-declare <9 x float> @llvm.fma.v9f32(<9 x float>, <9 x float>, <9 x float>) #1
-
-declare double @llvm.fma.f64(double, double, double) #1
-declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #1
-declare <3 x double> @llvm.fma.v3f64(<3 x double>, <3 x double>, <3 x double>) #1
-declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #1
-declare <5 x double> @llvm.fma.v5f64(<5 x double>, <5 x double>, <5 x double>) #1
-
-declare half @llvm.fma.f16(half, half, half) #1
-declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
-declare <3 x half> @llvm.fma.v3f16(<3 x half>, <3 x half>, <3 x half>) #1
-declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>) #1
-declare <5 x half> @llvm.fma.v5f16(<5 x half>, <5 x half>, <5 x half>) #1
-declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>) #1
-declare <17 x half> @llvm.fma.v17f16(<17 x half>, <17 x half>, <17 x half>) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
+define void @fma_f64() {
+; SLOW-LABEL: 'fma_f64'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fma_f64'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.fma.f64(double undef, double undef, double undef)
+  %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+  %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+  %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
index 2e4a9c70f3717b..adc4eea309a586 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -158,4 +158,55 @@ define amdgpu_kernel void @fmul_f16() #0 {
   ret void
 }

+define amdgpu_kernel void @fmul_bf16() #0 {
+; GFX9-LABEL: 'fmul_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fmul bfloat undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-LABEL: 'fmul_bf16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'fmul_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fmul_bf16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = fmul bfloat undef, undef
+  %v2bf16 = fmul <2 x bfloat> undef, undef
+  %v3bf16 = fmul <3 x bfloat> undef, undef
+  %v4bf16 = fmul <4 x bfloat> undef, undef
+  %v5bf16 = fmul <5 x bfloat> undef, undef
+  %v16bf16 = fmul <16 x bfloat> undef, undef
+  %v17bf16 = fmul <17 x bfloat> undef, undef
+  ret void
+}
+
 attributes #0 = { nounwind }
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmuladd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmuladd.ll
new file mode 100644
index 00000000000000..5e2ac451b23748
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmuladd.ll
@@ -0,0 +1,167 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW %s
+
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=SLOW-SIZE %s
+
+
+define void @fmuladd_f16() {
+; FAST-LABEL: 'fmuladd_f16'
+; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-LABEL: 'fmuladd_f16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; FAST-SIZE-LABEL: 'fmuladd_f16'
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fmuladd_f16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+  %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+  %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+  %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+  %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+  %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+  ret void
+}
+
+define void @fmuladd_bf16() {
+; FAST-LABEL: 'fmuladd_bf16'
+; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-LABEL: 'fmuladd_bf16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; FAST-SIZE-LABEL: 'fmuladd_bf16'
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fmuladd_bf16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+  %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+  %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+  %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+  %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+  %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+  ret void
+}
+
+define void @fmuladd_f32() {
+; SLOW-LABEL: 'fmuladd_f32'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fmuladd.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fmuladd.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret
void +; +; SLOW-SIZE-LABEL: 'fmuladd_f32' +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fmuladd.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fmuladd.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef) + %v2f32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) + %v3f32 = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) + %v4f32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) + %v5f32 = call <5 x float> @llvm.fmuladd.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) + %v8f32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) + %v9f32 = call <9 x float> @llvm.fmuladd.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) + ret void +} + +define void @fmuladd_f64() { +; SLOW-LABEL: 'fmuladd_f64' +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v3f64 = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v5f64 = call <5 x double> @llvm.fmuladd.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SLOW-SIZE-LABEL: 'fmuladd_f64' +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %v3f64 = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fmuladd.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef) + %v2f64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) + %v3f64 = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) + %v4f64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) + %v5f64 = call <5 x double> @llvm.fmuladd.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) + ret void +} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll b/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll index 725466a3c1a055..9af5dd352d5883 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll @@ -3,7 +3,75 @@ ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SIZE %s ; END. -define amdgpu_kernel void @fneg_f32() { +define void @fneg_f16() { +; CHECK-LABEL: 'fneg_f16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg half undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x half> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x half> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x half> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x half> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = fneg <8 x half> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x half> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x half> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'fneg_f16' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg half undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x half> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x half> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x half> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x half> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = fneg <8 x half> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x half> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x half> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = fneg half undef + %v2f16 = fneg <2 x half> undef + %v3f16 = fneg <3 x half> undef + 
%v4f16 = fneg <4 x half> undef + %v5f16 = fneg <5 x half> undef + %v8f16 = fneg <8 x half> undef + %v16f16 = fneg <16 x half> undef + %v17f16 = fneg <17 x half> undef + ret void +} + +define void @fneg_bf16() { +; CHECK-LABEL: 'fneg_bf16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg bfloat undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x bfloat> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x bfloat> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x bfloat> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x bfloat> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = fneg <8 x bfloat> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x bfloat> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x bfloat> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'fneg_bf16' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg bfloat undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x bfloat> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x bfloat> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x bfloat> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x bfloat> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = fneg <8 x bfloat> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x bfloat> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x bfloat> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = fneg bfloat undef + %v2f16 = fneg <2 x bfloat> undef + %v3f16 = fneg <3 x bfloat> undef + %v4f16 = fneg <4 x bfloat> undef + %v5f16 = fneg <5 x bfloat> undef + %v8f16 = fneg <8 x bfloat> undef + %v16f16 = fneg <16 x bfloat> undef + %v17f16 = fneg <17 x bfloat> undef + ret void +} + +define void @fneg_f32() { ; CHECK-LABEL: 'fneg_f32' ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f32 = fneg float undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = fneg <2 x float> undef @@ -12,6 +80,7 @@ define amdgpu_kernel void @fneg_f32() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = fneg <5 x float> undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = fneg <8 x float> undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v9f32 = fneg <9 x float> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = fneg <16 x float> undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SIZE-LABEL: 'fneg_f32' @@ -22,6 +91,7 @@ define amdgpu_kernel void @fneg_f32() { ; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = fneg <5 x float> undef ; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = fneg <8 x float> undef ; SIZE-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: %v9f32 = fneg <9 x float> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = fneg <16 x float> undef ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f32 = fneg float undef @@ -31,16 +101,19 @@ define amdgpu_kernel void @fneg_f32() { %v5f32 = fneg <5 x float> undef %v8f32 = fneg <8 x float> undef %v9f32 = fneg <9 x float> undef + %v16f32 = fneg <16 x float> undef ret void } -define amdgpu_kernel void @fneg_f64() { +define void @fneg_f64() { ; CHECK-LABEL: 'fneg_f64' ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f64 = fneg double undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = fneg <2 x double> undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = fneg <3 x double> undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = fneg <4 x double> undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = fneg <5 x double> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = fneg <8 x double> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = fneg <16 x double> undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SIZE-LABEL: 'fneg_f64' @@ -49,6 +122,8 @@ define amdgpu_kernel void @fneg_f64() { ; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = fneg <3 x double> undef ; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = fneg <4 x double> undef ; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = fneg <5 x double> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = fneg <8 x double> undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = fneg <16 x double> undef ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f64 = fneg double undef @@ -56,37 +131,8 @@ define amdgpu_kernel void @fneg_f64() { %v3f64 = fneg <3 x double> undef %v4f64 = fneg <4 x double> undef %v5f64 = fneg <5 x double> undef - ret void -} - -define amdgpu_kernel void @fneg_f16() { -; CHECK-LABEL: 'fneg_f16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg half undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x half> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x half> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x half> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x half> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x half> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x half> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void -; -; SIZE-LABEL: 'fneg_f16' -; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg half undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x half> undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x half> undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: 
%v4f16 = fneg <4 x half> undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x half> undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x half> undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x half> undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; - %f16 = fneg half undef - %v2f16 = fneg <2 x half> undef - %v3f16 = fneg <3 x half> undef - %v4f16 = fneg <4 x half> undef - %v5f16 = fneg <5 x half> undef - %v16f16 = fneg <16 x half> undef - %v17f16 = fneg <17 x half> undef + %v8f64 = fneg <8 x double> undef + %v16f64 = fneg <16 x double> undef ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll b/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll new file mode 100644 index 00000000000000..22134d042fabb6 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,GFX7 %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s + +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL-SIZE,GFX7-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s + +define void @frexp_f16_i32() { +; GFX7-LABEL: 'frexp_f16_i32' +; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i32> } @llvm.frexp.v4f16.v4i32(<4 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = 
call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8PLUS-LABEL: 'frexp_f16_i32' +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i32> } @llvm.frexp.v4f16.v4i32(<4 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'frexp_f16_i32' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i32> } @llvm.frexp.v4f16.v4i32(<4 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8PLUS-SIZE-LABEL: 'frexp_f16_i32' +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i32> 
} @llvm.frexp.v4f16.v4i32(<4 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef) + %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef) + %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef) + %v4f16 = call { <4 x half>, <4 x i32> } @llvm.frexp.v4f16.v4i32(<4 x half> undef) + %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef) + %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef) + %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef) + %v17f16 = call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef) + ret void +} + +define void @frexp_f16_i16() { +; GFX7-LABEL: 'frexp_f16_i16' +; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8PLUS-LABEL: 'frexp_f16_i16' +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 
for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'frexp_f16_i16' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8PLUS-SIZE-LABEL: 'frexp_f16_i16' +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef) 
+ %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef) + %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef) + %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef) + %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef) + %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef) + %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef) + %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef) + ret void +} + +define void @frexp_bf16() { +; GFX7-LABEL: 'frexp_bf16' +; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8PLUS-LABEL: 'frexp_bf16' +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'frexp_bf16' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8PLUS-SIZE-LABEL: 'frexp_bf16' +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef) + %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef) + %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef) + %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef) + %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef) + %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef) + %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef) + %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef) + ret void +} + +define void 
@frexp_f32() { +; ALL-LABEL: 'frexp_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call { <3 x float>, <3 x i32> } @llvm.frexp.v3f32.v3i32(<3 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call { <5 x float>, <5 x i32> } @llvm.frexp.v5f32.v5i32(<5 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call { <8 x float>, <8 x i32> } @llvm.frexp.v8f32.v8i32(<8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call { <16 x float>, <16 x i32> } @llvm.frexp.v16f32.v16i32(<16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f32 = call { <17 x float>, <17 x i32> } @llvm.frexp.v17f32.v17i32(<17 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'frexp_f32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call { <3 x float>, <3 x i32> } @llvm.frexp.v3f32.v3i32(<3 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call { <5 x float>, <5 x i32> } @llvm.frexp.v5f32.v5i32(<5 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call { <8 x float>, <8 x i32> } @llvm.frexp.v8f32.v8i32(<8 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call { <16 x float>, <16 x i32> } @llvm.frexp.v16f32.v16i32(<16 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f32 = call { <17 x float>, <17 x i32> } @llvm.frexp.v17f32.v17i32(<17 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call { float, i32 } @llvm.frexp.f32.i32(float undef) + %v2f32 = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> undef) + %v3f32 = call { <3 x float>, <3 x i32> } @llvm.frexp.v3f32.v3i32(<3 x float> undef) + %v4f32 = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> undef) + %v5f32 = call { <5 x float>, <5 x i32> } @llvm.frexp.v5f32.v5i32(<5 x float> undef) + %v8f32 = call { <8 x float>, <8 x i32> } @llvm.frexp.v8f32.v8i32(<8 x float> undef) + %v16f32 = call { <16 x float>, <16 x i32> } @llvm.frexp.v16f32.v16i32(<16 x float> undef) + %v17f32 = call { <17 x float>, <17 x i32> } @llvm.frexp.v17f32.v17i32(<17 x float> undef) + ret void +} + +define void @frexp_f64() { +; ALL-LABEL: 'frexp_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %f64 = call { double, i32 } @llvm.frexp.f64.i32(double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call { <3 x double>, <3 x i32> } @llvm.frexp.v3f64.v3i32(<3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call { <4 x double>, <4 x i32> } @llvm.frexp.v4f64.v4i32(<4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call { <5 x double>, <5 x i32> } @llvm.frexp.v5f64.v5i32(<5 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call { <8 x double>, <8 x i32> } @llvm.frexp.v8f64.v8i32(<8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call { <16 x double>, <16 x i32> } @llvm.frexp.v16f64.v16i32(<16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call { <17 x double>, <17 x i32> } @llvm.frexp.v17f64.v17i32(<17 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'frexp_f64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call { double, i32 } @llvm.frexp.f64.i32(double undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call { <3 x double>, <3 x i32> } @llvm.frexp.v3f64.v3i32(<3 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call { <4 x double>, <4 x i32> } @llvm.frexp.v4f64.v4i32(<4 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call { <5 x double>, <5 x i32> } @llvm.frexp.v5f64.v5i32(<5 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call { <8 x double>, <8 x i32> } @llvm.frexp.v8f64.v8i32(<8 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call { <16 x double>, <16 x i32> } @llvm.frexp.v16f64.v16i32(<16 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call { <17 x double>, <17 x i32> } @llvm.frexp.v17f64.v17i32(<17 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call { double, i32 } @llvm.frexp.f64.i32(double undef) + %v2f64 = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> undef) + %v3f64 = call { <3 x double>, <3 x i32> } @llvm.frexp.v3f64.v3i32(<3 x double> undef) + %v4f64 = call { <4 x double>, <4 x i32> } @llvm.frexp.v4f64.v4i32(<4 x double> undef) + %v5f64 = call { <5 x double>, <5 x i32> } @llvm.frexp.v5f64.v5i32(<5 x double> undef) + %v8f64 = call { <8 x double>, <8 x i32> } @llvm.frexp.v8f64.v8i32(<8 x double> undef) + %v16f64 = call { <16 x double>, <16 x i32> } @llvm.frexp.v16f64.v16i32(<16 x double> undef) + %v17f64 = call { <17 x double>, <17 x i32> } @llvm.frexp.v17f64.v17i32(<17 x double> undef) + ret void +} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/is_fpclass.ll b/llvm/test/Analysis/CostModel/AMDGPU/is_fpclass.ll new file mode 100644 index 
00000000000000..fc7af38e4729f5 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/is_fpclass.ll @@ -0,0 +1,139 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s + +define void @is_fpclass_f16() { +; ALL-LABEL: 'is_fpclass_f16' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call i1 @llvm.is.fpclass.f16(half undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x i1> @llvm.is.fpclass.v2f16(<2 x half> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x i1> @llvm.is.fpclass.v3f16(<3 x half> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x i1> @llvm.is.fpclass.v4f16(<4 x half> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x i1> @llvm.is.fpclass.v5f16(<5 x half> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x i1> @llvm.is.fpclass.v8f16(<8 x half> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x i1> @llvm.is.fpclass.v16f16(<16 x half> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x i1> @llvm.is.fpclass.v17f16(<17 x half> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'is_fpclass_f16' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call i1 @llvm.is.fpclass.f16(half undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x i1> @llvm.is.fpclass.v2f16(<2 x half> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x i1> @llvm.is.fpclass.v3f16(<3 x half> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x i1> @llvm.is.fpclass.v4f16(<4 x half> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x i1> @llvm.is.fpclass.v5f16(<5 x half> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x i1> @llvm.is.fpclass.v8f16(<8 x half> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x i1> @llvm.is.fpclass.v16f16(<16 x half> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x i1> @llvm.is.fpclass.v17f16(<17 x half> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call i1 @llvm.is.fpclass.f16(half undef, i32 0) + %v2f16 = call <2 x i1> @llvm.is.fpclass.v2f16(<2 x half> undef, i32 0) + %v3f16 = call <3 x i1> @llvm.is.fpclass.v3f16(<3 x half> undef, i32 0) + %v4f16 = call <4 x i1> @llvm.is.fpclass.v4f16(<4 x half> undef, i32 0) + %v5f16 = call <5 x i1> @llvm.is.fpclass.v5f16(<5 x half> undef, i32 0) + %v8f16 = call <8 x i1> @llvm.is.fpclass.v8f16(<8 x half> undef, i32 0) + %v16f16 = call <16 x i1> @llvm.is.fpclass.v16f16(<16 x half> 
undef, i32 0) + %v17f16 = call <17 x i1> @llvm.is.fpclass.v17f16(<17 x half> undef, i32 0) + ret void +} + +define void @is_fpclass_bf16() { +; ALL-LABEL: 'is_fpclass_bf16' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call i1 @llvm.is.fpclass.bf16(bfloat undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x i1> @llvm.is.fpclass.v2bf16(<2 x bfloat> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x i1> @llvm.is.fpclass.v3bf16(<3 x bfloat> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x i1> @llvm.is.fpclass.v4bf16(<4 x bfloat> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call <5 x i1> @llvm.is.fpclass.v5bf16(<5 x bfloat> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x i1> @llvm.is.fpclass.v8bf16(<8 x bfloat> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x i1> @llvm.is.fpclass.v16bf16(<16 x bfloat> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call <17 x i1> @llvm.is.fpclass.v17bf16(<17 x bfloat> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'is_fpclass_bf16' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call i1 @llvm.is.fpclass.bf16(bfloat undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x i1> @llvm.is.fpclass.v2bf16(<2 x bfloat> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x i1> @llvm.is.fpclass.v3bf16(<3 x bfloat> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x i1> @llvm.is.fpclass.v4bf16(<4 x bfloat> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call <5 x i1> @llvm.is.fpclass.v5bf16(<5 x bfloat> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x i1> @llvm.is.fpclass.v8bf16(<8 x bfloat> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x i1> @llvm.is.fpclass.v16bf16(<16 x bfloat> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call <17 x i1> @llvm.is.fpclass.v17bf16(<17 x bfloat> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call i1 @llvm.is.fpclass.bf16(bfloat undef, i32 0) + %v2bf16 = call <2 x i1> @llvm.is.fpclass.v2bf16(<2 x bfloat> undef, i32 0) + %v3bf16 = call <3 x i1> @llvm.is.fpclass.v3bf16(<3 x bfloat> undef, i32 0) + %v4bf16 = call <4 x i1> @llvm.is.fpclass.v4bf16(<4 x bfloat> undef, i32 0) + %v5bf16 = call <5 x i1> @llvm.is.fpclass.v5bf16(<5 x bfloat> undef, i32 0) + %v8bf16 = call <8 x i1> @llvm.is.fpclass.v8bf16(<8 x bfloat> undef, i32 0) + %v16bf16 = call <16 x i1> @llvm.is.fpclass.v16bf16(<16 x bfloat> undef, i32 0) + %v17bf16 = call <17 x i1> @llvm.is.fpclass.v17bf16(<17 x bfloat> undef, i32 0) + ret void +} + +define void @is_fpclass_f32() { +; ALL-LABEL: 'is_fpclass_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call i1 @llvm.is.fpclass.f32(float 
undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x i1> @llvm.is.fpclass.v3f32(<3 x float> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x i1> @llvm.is.fpclass.v5f32(<5 x float> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x i1> @llvm.is.fpclass.v16f32(<16 x float> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x i1> @llvm.is.fpclass.v17f32(<17 x float> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'is_fpclass_f32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call i1 @llvm.is.fpclass.f32(float undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x i1> @llvm.is.fpclass.v3f32(<3 x float> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x i1> @llvm.is.fpclass.v5f32(<5 x float> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x i1> @llvm.is.fpclass.v16f32(<16 x float> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x i1> @llvm.is.fpclass.v17f32(<17 x float> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call i1 @llvm.is.fpclass.f32(float undef, i32 0) + %v2f32 = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> undef, i32 0) + %v3f32 = call <3 x i1> @llvm.is.fpclass.v3f32(<3 x float> undef, i32 0) + %v4f32 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> undef, i32 0) + %v5f32 = call <5 x i1> @llvm.is.fpclass.v5f32(<5 x float> undef, i32 0) + %v8f32 = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> undef, i32 0) + %v16f32 = call <16 x i1> @llvm.is.fpclass.v16f32(<16 x float> undef, i32 0) + %v17f32 = call <17 x i1> @llvm.is.fpclass.v17f32(<17 x float> undef, i32 0) + ret void +} + +define void @is_fpclass_f64() { +; ALL-LABEL: 'is_fpclass_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call i1 @llvm.is.fpclass.f64(double undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x i1> @llvm.is.fpclass.v2f64(<2 x double> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x i1> @llvm.is.fpclass.v3f64(<3 x double> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %v4f64 = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f64 = call <5 x i1> @llvm.is.fpclass.v5f64(<5 x double> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x i1> @llvm.is.fpclass.v8f64(<8 x double> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x i1> @llvm.is.fpclass.v16f64(<16 x double> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f64 = call <17 x i1> @llvm.is.fpclass.v17f64(<17 x double> undef, i32 0) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'is_fpclass_f64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call i1 @llvm.is.fpclass.f64(double undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x i1> @llvm.is.fpclass.v2f64(<2 x double> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x i1> @llvm.is.fpclass.v3f64(<3 x double> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f64 = call <5 x i1> @llvm.is.fpclass.v5f64(<5 x double> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x i1> @llvm.is.fpclass.v8f64(<8 x double> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x i1> @llvm.is.fpclass.v16f64(<16 x double> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f64 = call <17 x i1> @llvm.is.fpclass.v17f64(<17 x double> undef, i32 0) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call i1 @llvm.is.fpclass.f64(double undef, i32 0) + %v2f64 = call <2 x i1> @llvm.is.fpclass.v2f64(<2 x double> undef, i32 0) + %v3f64 = call <3 x i1> @llvm.is.fpclass.v3f64(<3 x double> undef, i32 0) + %v4f64 = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> undef, i32 0) + %v5f64 = call <5 x i1> @llvm.is.fpclass.v5f64(<5 x double> undef, i32 0) + %v8f64 = call <8 x i1> @llvm.is.fpclass.v8f64(<8 x double> undef, i32 0) + %v16f64 = call <16 x i1> @llvm.is.fpclass.v16f64(<16 x double> undef, i32 0) + %v17f64 = call <17 x i1> @llvm.is.fpclass.v17f64(<17 x double> undef, i32 0) + ret void +} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/ldexp.ll b/llvm/test/Analysis/CostModel/AMDGPU/ldexp.ll new file mode 100644 index 00000000000000..2b1b5906ad0179 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/ldexp.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,GFX7 %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 < %s |
FileCheck -check-prefixes=ALL,GFX8PLUS %s + +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL-SIZE,GFX7-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s + +define void @ldexp_f16_i32() { +; GFX7-LABEL: 'ldexp_f16_i32' +; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8PLUS-LABEL: 'ldexp_f16_i32' +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'ldexp_f16_i32'
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8PLUS-SIZE-LABEL: 'ldexp_f16_i32' +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef) + %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef) + %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef) + %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef) + %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef) + %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef) + %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef) + %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef) + ret void +} + +define void @ldexp_f16_i16() { +; GFX7-LABEL: 
'ldexp_f16_i16' +; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8PLUS-LABEL: 'ldexp_f16_i16' +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'ldexp_f16_i16' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef) 
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8PLUS-SIZE-LABEL: 'ldexp_f16_i16' +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef) + %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef) + %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef) + %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef) + %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef) + %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef) + %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef) + %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef) + ret void +} + +define void @ldexp_bf16() { +; GFX7-LABEL: 'ldexp_bf16' +; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x 
i32> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8PLUS-LABEL: 'ldexp_bf16' +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x i32> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef) +; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'ldexp_bf16' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x i32> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8PLUS-SIZE-LABEL: 'ldexp_bf16' +; GFX8PLUS-SIZE-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x i32> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef) +; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef) + %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef) + %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef) + %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef) + %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x i32> undef) + %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef) + %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef) + %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef) + ret void +} + +define void @ldexp_f32() { +; ALL-LABEL: 'ldexp_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.ldexp.f32.i32(float undef, i32 undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> undef, <2 x i32> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.ldexp.v3f32.v3i32(<3 x float> undef, <3 x i32> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> undef, <4 x i32> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.ldexp.v5f32.v5i32(<5 x float> undef, <5 x i32> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> undef, <8 x i32> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> undef, <16 x i32> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f32 = call <17 x float> @llvm.ldexp.v17f32.v17i32(<17 x float> undef, <17 x i32> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; 
ALL-SIZE-LABEL: 'ldexp_f32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.ldexp.f32.i32(float undef, i32 undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> undef, <2 x i32> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.ldexp.v3f32.v3i32(<3 x float> undef, <3 x i32> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> undef, <4 x i32> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.ldexp.v5f32.v5i32(<5 x float> undef, <5 x i32> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> undef, <8 x i32> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> undef, <16 x i32> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f32 = call <17 x float> @llvm.ldexp.v17f32.v17i32(<17 x float> undef, <17 x i32> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.ldexp.f32.i32(float undef, i32 undef) + %v2f32 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> undef, <2 x i32> undef) + %v3f32 = call <3 x float> @llvm.ldexp.v3f32.v3i32(<3 x float> undef, <3 x i32> undef) + %v4f32 = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> undef, <4 x i32> undef) + %v5f32 = call <5 x float> @llvm.ldexp.v5f32.v5i32(<5 x float> undef, <5 x i32> undef) + %v8f32 = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> undef, <8 x i32> undef) + %v16f32 = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> undef, <16 x i32> undef) + %v17f32 = call <17 x float> @llvm.ldexp.v17f32.v17i32(<17 x float> undef, <17 x i32> undef) + ret void +} + +define void @ldexp_f64() { +; ALL-LABEL: 'ldexp_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.ldexp.f64.i32(double undef, i32 undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> undef, <2 x i32> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.ldexp.v3f64.v3i32(<3 x double> undef, <3 x i32> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> undef, <4 x i32> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.ldexp.v5f64.v5i32(<5 x double> undef, <5 x i32> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> undef, <8 x i32> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.ldexp.v16f64.v16i32(<16 x double> undef, <16 x i32> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.ldexp.v17f64.v17i32(<17 x double> undef, <17 x i32> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'ldexp_f64' +; 
ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.ldexp.f64.i32(double undef, i32 undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> undef, <2 x i32> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.ldexp.v3f64.v3i32(<3 x double> undef, <3 x i32> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> undef, <4 x i32> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.ldexp.v5f64.v5i32(<5 x double> undef, <5 x i32> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> undef, <8 x i32> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.ldexp.v16f64.v16i32(<16 x double> undef, <16 x i32> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.ldexp.v17f64.v17i32(<17 x double> undef, <17 x i32> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.ldexp.f64.i32(double undef, i32 undef) + %v2f64 = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> undef, <2 x i32> undef) + %v3f64 = call <3 x double> @llvm.ldexp.v3f64.v3i32(<3 x double> undef, <3 x i32> undef) + %v4f64 = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> undef, <4 x i32> undef) + %v5f64 = call <5 x double> @llvm.ldexp.v5f64.v5i32(<5 x double> undef, <5 x i32> undef) + %v8f64 = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> undef, <8 x i32> undef) + %v16f64 = call <16 x double> @llvm.ldexp.v16f64.v16i32(<16 x double> undef, <16 x i32> undef) + %v17f64 = call <17 x double> @llvm.ldexp.v17f64.v17i32(<17 x double> undef, <17 x i32> undef) + ret void +} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/log.ll b/llvm/test/Analysis/CostModel/AMDGPU/log.ll new file mode 100644 index 00000000000000..2cf7039982f262 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/log.ll @@ -0,0 +1,278 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s + +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck
-check-prefixes=ALL-SIZE,GFX9-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s + +define void @log_f16() { +; BASE-LABEL: 'log_f16' +; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'log_f16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'log_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 50 for instruction:
%v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'log_f16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'log_f16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'log_f16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for 
instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'log_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'log_f16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.log.f16(half undef) + %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef) + %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef) + %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef) + %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef) + %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef) + %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef) + %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef) + ret void +} + +define void @log_bf16() { +; BASE-LABEL: 'log_bf16' +; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef) +; BASE-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'log_bf16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'log_bf16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'log_bf16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> 
@llvm.log.v3bf16(<3 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'log_bf16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'log_bf16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'log_bf16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'log_bf16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.log.bf16(bfloat undef) + %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef) + %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef) + %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef) + %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef) + %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef) + %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef) + %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef) + ret void +} + +define void @log_f32() { +; ALL-LABEL: 'log_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log.f32(float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log.v2f32(<2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log.v3f32(<3 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log.v4f32(<4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log.v5f32(<5 x float> undef) +; ALL-NEXT: Cost Model: 
Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log.v8f32(<8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log.v16f32(<16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log.v17f32(<17 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'log_f32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log.f32(float undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log.v2f32(<2 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log.v3f32(<3 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log.v4f32(<4 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log.v5f32(<5 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log.v8f32(<8 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log.v16f32(<16 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log.v17f32(<17 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.log.f32(float undef) + %v2f32 = call <2 x float> @llvm.log.v2f32(<2 x float> undef) + %v3f32 = call <3 x float> @llvm.log.v3f32(<3 x float> undef) + %v4f32 = call <4 x float> @llvm.log.v4f32(<4 x float> undef) + %v5f32 = call <5 x float> @llvm.log.v5f32(<5 x float> undef) + %v8f32 = call <8 x float> @llvm.log.v8f32(<8 x float> undef) + %v16f32 = call <16 x float> @llvm.log.v16f32(<16 x float> undef) + %v17f32 = call <17 x float> @llvm.log.v17f32(<17 x float> undef) + ret void +} + +define void @log_f64() { +; ALL-LABEL: 'log_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.log.f64(double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.log.v2f64(<2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.log.v3f64(<3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.log.v4f64(<4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.log.v5f64(<5 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.log.v8f64(<8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.log.v16f64(<16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.log.v17f64(<17 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'log_f64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.log.f64(double undef) +; 
ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.log.v2f64(<2 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.log.v3f64(<3 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.log.v4f64(<4 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.log.v5f64(<5 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.log.v8f64(<8 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.log.v16f64(<16 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.log.v17f64(<17 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.log.f64(double undef) + %v2f64 = call <2 x double> @llvm.log.v2f64(<2 x double> undef) + %v3f64 = call <3 x double> @llvm.log.v3f64(<3 x double> undef) + %v4f64 = call <4 x double> @llvm.log.v4f64(<4 x double> undef) + %v5f64 = call <5 x double> @llvm.log.v5f64(<5 x double> undef) + %v8f64 = call <8 x double> @llvm.log.v8f64(<8 x double> undef) + %v16f64 = call <16 x double> @llvm.log.v16f64(<16 x double> undef) + %v17f64 = call <17 x double> @llvm.log.v17f64(<17 x double> undef) + ret void +} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/log10.ll b/llvm/test/Analysis/CostModel/AMDGPU/log10.ll new file mode 100644 index 00000000000000..d807c6cd63648e --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/log10.ll @@ -0,0 +1,278 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s + +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s + +define void @log10_f16() { +; BASE-LABEL: 'log10_f16' +; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef) +;
BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'log10_f16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'log10_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'log10_f16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef) +; GFX10-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'log10_f16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'log10_f16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'log10_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call 
<2 x half> @llvm.log10.v2f16(<2 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'log10_f16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.log10.f16(half undef) + %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef) + %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef) + %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef) + %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef) + %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef) + %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef) + %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef) + ret void +} + +define void @log10_bf16() { +; BASE-LABEL: 'log10_bf16' +; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x 
bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'log10_bf16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'log10_bf16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'log10_bf16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> 
@llvm.log10.v5bf16(<5 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'log10_bf16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'log10_bf16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'log10_bf16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef) +; 
GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'log10_bf16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.log10.bf16(bfloat undef) + %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef) + %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef) + %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef) + %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef) + %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef) + %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef) + %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef) + ret void +} + +define void @log10_f32() { +; ALL-LABEL: 'log10_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log10.f32(float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log10.v3f32(<3 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log10.v5f32(<5 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 
for instruction: %v16f32 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log10.v17f32(<17 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'log10_f32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log10.f32(float undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log10.v3f32(<3 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log10.v5f32(<5 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log10.v17f32(<17 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.log10.f32(float undef) + %v2f32 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef) + %v3f32 = call <3 x float> @llvm.log10.v3f32(<3 x float> undef) + %v4f32 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef) + %v5f32 = call <5 x float> @llvm.log10.v5f32(<5 x float> undef) + %v8f32 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef) + %v16f32 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef) + %v17f32 = call <17 x float> @llvm.log10.v17f32(<17 x float> undef) + ret void +} + +define void @log10_f64() { +; ALL-LABEL: 'log10_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.log10.f64(double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.log10.v3f64(<3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.log10.v5f64(<5 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.log10.v17f64(<17 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'log10_f64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.log10.f64(double undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> 
@llvm.log10.v2f64(<2 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.log10.v3f64(<3 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.log10.v5f64(<5 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.log10.v17f64(<17 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.log10.f64(double undef) + %v2f64 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef) + %v3f64 = call <3 x double> @llvm.log10.v3f64(<3 x double> undef) + %v4f64 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef) + %v5f64 = call <5 x double> @llvm.log10.v5f64(<5 x double> undef) + %v8f64 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef) + %v16f64 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef) + %v17f64 = call <17 x double> @llvm.log10.v17f64(<17 x double> undef) + ret void +} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/log2.ll b/llvm/test/Analysis/CostModel/AMDGPU/log2.ll new file mode 100644 index 00000000000000..1ef3977fe4a8f1 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/log2.ll @@ -0,0 +1,278 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s + +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s + +define void @log2_f16() { +; BASE-LABEL: 'log2_f16' +; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log2.f16(half undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 =
call <3 x half> @llvm.log2.v3f16(<3 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'log2_f16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'log2_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'log2_f16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef) +; 
GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'log2_f16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log2.f16(half undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'log2_f16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'log2_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 
= call <3 x half> @llvm.log2.v3f16(<3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'log2_f16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.log2.f16(half undef) + %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef) + %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef) + %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef) + %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef) + %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef) + %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef) + %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef) + ret void +} + +define void @log2_bf16() { +; BASE-LABEL: 'log2_bf16' +; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> 
@llvm.log2.v16bf16(<16 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'log2_bf16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'log2_bf16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'log2_bf16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef) +; GFX10-NEXT: 
Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'log2_bf16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'log2_bf16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'log2_bf16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = 
call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'log2_bf16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.log2.bf16(bfloat undef) + %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef) + %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef) + %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef) + %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef) + %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef) + %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef) + %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef) + ret void +} + +define void @log2_f32() { +; ALL-LABEL: 'log2_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log2.f32(float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log2.v3f32(<3 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log2.v5f32(<5 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log2.v17f32(<17 x float> undef) +; ALL-NEXT: Cost Model: Found an 
estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'log2_f32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log2.f32(float undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log2.v3f32(<3 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log2.v5f32(<5 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log2.v17f32(<17 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.log2.f32(float undef) + %v2f32 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef) + %v3f32 = call <3 x float> @llvm.log2.v3f32(<3 x float> undef) + %v4f32 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef) + %v5f32 = call <5 x float> @llvm.log2.v5f32(<5 x float> undef) + %v8f32 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef) + %v16f32 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef) + %v17f32 = call <17 x float> @llvm.log2.v17f32(<17 x float> undef) + ret void +} + +define void @log2_f64() { +; ALL-LABEL: 'log2_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.log2.f64(double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.log2.v3f64(<3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.log2.v5f64(<5 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.log2.v17f64(<17 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'log2_f64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.log2.f64(double undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.log2.v3f64(<3 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> 
@llvm.log2.v4f64(<4 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.log2.v5f64(<5 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.log2.v17f64(<17 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.log2.f64(double undef) + %v2f64 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef) + %v3f64 = call <3 x double> @llvm.log2.v3f64(<3 x double> undef) + %v4f64 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef) + %v5f64 = call <5 x double> @llvm.log2.v5f64(<5 x double> undef) + %v8f64 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef) + %v16f64 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef) + %v17f64 = call <17 x double> @llvm.log2.v17f64(<17 x double> undef) + ret void +} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll b/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll new file mode 100644 index 00000000000000..0f6f01b9c1d361 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,GFX90A-FASTF64 %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,FASTF64 %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOWF64 %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX90A-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s + +define void @maximum_f16() { +; GFX9-LABEL: 'maximum_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %v16f16 = call
<16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SLOWF64-LABEL: 'maximum_f16' +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'maximum_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-SIZE-LABEL: 'maximum_f16' +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-LABEL: 'maximum_f16' +; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = 
call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void + %f16 = call half @llvm.maximum.f16(half undef, half undef) + %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef) + %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef) + %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) + %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) + %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) + ret void +} + +define void @maximum_bf16() { +; GFX9-LABEL: 'maximum_bf16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SLOWF64-LABEL: 'maximum_bf16' +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'maximum_bf16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of
1 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-SIZE-LABEL: 'maximum_bf16' +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-LABEL: 'maximum_bf16' +; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void + %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef) + %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) + %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) + %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) + %v8bf16 = call <8 x bfloat>
@llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) + %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) + ret void +} + +define void @maximum_f32() { +; ALL-LABEL: 'maximum_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'maximum_f32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.maximum.f32(float undef, float undef) + %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef) + %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef) + %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef) + %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef) + %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef) + ret void +} + +define void @maximum_f64() { +; ALL-LABEL: 'maximum_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.maximum.f64(double undef, double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.maximum.v2f64(<2 x double> undef, <2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.maximum.v3f64(<3 x double> undef, <3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.maximum.v4f64(<4 x double> undef, <4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double>
@llvm.maximum.v8f64(<8 x double> undef, <8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.maximum.v16f64(<16 x double> undef, <16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'maximum_f64' +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.maximum.f64(double undef, double undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.maximum.v2f64(<2 x double> undef, <2 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.maximum.v3f64(<3 x double> undef, <3 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.maximum.v4f64(<4 x double> undef, <4 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.maximum.v8f64(<8 x double> undef, <8 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.maximum.v16f64(<16 x double> undef, <16 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.maximum.f64(double undef, double undef) + %v2f64 = call <2 x double> @llvm.maximum.v2f64(<2 x double> undef, <2 x double> undef) + %v3f64 = call <3 x double> @llvm.maximum.v3f64(<3 x double> undef, <3 x double> undef) + %v4f64 = call <4 x double> @llvm.maximum.v4f64(<4 x double> undef, <4 x double> undef) + %v8f64 = call <8 x double> @llvm.maximum.v8f64(<8 x double> undef, <8 x double> undef) + %v16f64 = call <16 x double> @llvm.maximum.v16f64(<16 x double> undef, <16 x double> undef) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated.
Do not add tests below this line: +; FASTF64: {{.*}} +; GFX90A-FASTF64: {{.*}} +; GFX90A-SIZE: {{.*}} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/maxnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/maxnum.ll new file mode 100644 index 00000000000000..0d423fec7adcce --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/maxnum.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,GFX90A-FASTF64 %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,FASTF64 %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOWF64 %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX90A-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s + +define void @maxnum_f16() { +; GFX9-LABEL: 'maxnum_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SLOWF64-LABEL: 'maxnum_f16' +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an
estimated cost of 10 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'maxnum_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-SIZE-LABEL: 'maxnum_f16' +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-LABEL: 'maxnum_f16' +; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void + %f16 = call half @llvm.maxnum.f16(half undef, half undef) + %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef) + %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef) + %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef) + %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef) + %v16f16 =
call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef) + ret void +} + +define void @maxnum_bf16() { +; GFX9-LABEL: 'maxnum_bf16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SLOWF64-LABEL: 'maxnum_bf16' +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'maxnum_bf16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-SIZE-LABEL: 'maxnum_bf16' +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef) +; 
SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-LABEL: 'maxnum_bf16' +; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void + %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef) + %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) + %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) + %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) + %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) + %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) + ret void +} + +define void @maxnum_f32() { +; ALL-LABEL: 'maxnum_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maxnum.f32(float undef, float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maxnum.v3f32(<3 x float> undef, <3 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret
void +; +; SIZE-LABEL: 'maxnum_f32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maxnum.f32(float undef, float undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maxnum.v3f32(<3 x float> undef, <3 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.maxnum.f32(float undef, float undef) + %v2f32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef) + %v3f32 = call <3 x float> @llvm.maxnum.v3f32(<3 x float> undef, <3 x float> undef) + %v4f32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef) + %v8f32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef) + %v16f32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef) + ret void +} + +define void @maxnum_f64() { +; ALL-LABEL: 'maxnum_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maxnum.f64(double undef, double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maxnum.v3f64(<3 x double> undef, <3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maxnum.v16f64(<16 x double> undef, <16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'maxnum_f64' +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maxnum.f64(double undef, double undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maxnum.v3f64(<3 x double> undef, <3 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maxnum.v16f64(<16
x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.maxnum.f64(double undef, double undef) + %v2f64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef) + %v3f64 = call <3 x double> @llvm.maxnum.v3f64(<3 x double> undef, <3 x double> undef) + %v4f64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef) + %v8f64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef) + %v16f64 = call <16 x double> @llvm.maxnum.v16f64(<16 x double> undef, <16 x double> undef) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; FASTF64: {{.*}} +; GFX90A-FASTF64: {{.*}} +; GFX90A-SIZE: {{.*}} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll b/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll new file mode 100644 index 00000000000000..b6e52cfaaaadab --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,GFX90A-FASTF64 %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,FASTF64 %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOWF64 %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX90A-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s + +define void @minimum_f16() { +; GFX9-LABEL: 'minimum_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SLOWF64-LABEL: 'minimum_f16' +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v2f16 = call <2 x half>
@llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'minimum_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-SIZE-LABEL: 'minimum_f16' +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-LABEL: 'minimum_f16' +; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated 
cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void + %f16 = call half @llvm.minimum.f16(half undef, half undef) + %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef) + %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef) + %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) + %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) + %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) + ret void +} + +define void @minimum_bf16() { +; GFX9-LABEL: 'minimum_bf16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SLOWF64-LABEL: 'minimum_bf16' +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'minimum_bf16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +;
GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-SIZE-LABEL: 'minimum_bf16' +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-LABEL: 'minimum_bf16' +; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void + %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef) + %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) + %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) + %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) + %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) + %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) + ret void +} + +define void @minimum_f32() { +; ALL-LABEL: 'minimum_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call float @llvm.minimum.f32(float undef, float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 20 for
instruction: %v2f32 = call <2 x float> @llvm.minimum.v2f32(<2 x float> undef, <2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f32 = call <3 x float> @llvm.minimum.v3f32(<3 x float> undef, <3 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f32 = call <4 x float> @llvm.minimum.v4f32(<4 x float> undef, <4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f32 = call <8 x float> @llvm.minimum.v8f32(<8 x float> undef, <8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f32 = call <16 x float> @llvm.minimum.v16f32(<16 x float> undef, <16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'minimum_f32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimum.f32(float undef, float undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimum.v2f32(<2 x float> undef, <2 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimum.v3f32(<3 x float> undef, <3 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimum.v4f32(<4 x float> undef, <4 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimum.v8f32(<8 x float> undef, <8 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.minimum.v16f32(<16 x float> undef, <16 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.minimum.f32(float undef, float undef) + %v2f32 = call <2 x float> @llvm.minimum.v2f32(<2 x float> undef, <2 x float> undef) + %v3f32 = call <3 x float> @llvm.minimum.v3f32(<3 x float> undef, <3 x float> undef) + %v4f32 = call <4 x float> @llvm.minimum.v4f32(<4 x float> undef, <4 x float> undef) + %v8f32 = call <8 x float> @llvm.minimum.v8f32(<8 x float> undef, <8 x float> undef) + %v16f32 = call <16 x float> @llvm.minimum.v16f32(<16 x float> undef, <16 x float> undef) + ret void +} + +define void @minimum_f64() { +; ALL-LABEL: 'minimum_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.minimum.f64(double undef, double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.minimum.v2f64(<2 x double> undef, <2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.minimum.v3f64(<3 x double> undef, <3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.minimum.v4f64(<4 x double> undef, <4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.minimum.v8f64(<8 x double> undef, <8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.minimum.v16f64(<16 x double> undef, <16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'minimum_f64' +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call
double @llvm.minimum.f64(double undef, double undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.minimum.v2f64(<2 x double> undef, <2 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.minimum.v3f64(<3 x double> undef, <3 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.minimum.v4f64(<4 x double> undef, <4 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.minimum.v8f64(<8 x double> undef, <8 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.minimum.v16f64(<16 x double> undef, <16 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.minimum.f64(double undef, double undef) + %v2f64 = call <2 x double> @llvm.minimum.v2f64(<2 x double> undef, <2 x double> undef) + %v3f64 = call <3 x double> @llvm.minimum.v3f64(<3 x double> undef, <3 x double> undef) + %v4f64 = call <4 x double> @llvm.minimum.v4f64(<4 x double> undef, <4 x double> undef) + %v8f64 = call <8 x double> @llvm.minimum.v8f64(<8 x double> undef, <8 x double> undef) + %v16f64 = call <16 x double> @llvm.minimum.v16f64(<16 x double> undef, <16 x double> undef) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; FASTF64: {{.*}} +; GFX90A-FASTF64: {{.*}} +; GFX90A-SIZE: {{.*}} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/minnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/minnum.ll new file mode 100644 index 00000000000000..61432ba4409202 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/minnum.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,GFX90A-FASTF64 %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,FASTF64 %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOWF64 %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX90A-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s + +define void @minnum_f16() { +; GFX9-LABEL: 'minnum_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SLOWF64-LABEL: 'minnum_f16' +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'minnum_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-SIZE-LABEL: 'minnum_f16' +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> 
@llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-LABEL: 'minnum_f16' +; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void + %f16 = call half @llvm.minnum.f16(half undef, half undef) + %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef) + %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef) + %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef) + %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef) + %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef) + ret void +} + +define void @minnum_bf16() { +; GFX9-LABEL: 'minnum_bf16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SLOWF64-LABEL: 'minnum_bf16' +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat>
undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'minnum_bf16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-SIZE-LABEL: 'minnum_bf16' +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SLOW-LABEL: 'minnum_bf16' +; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void + %bf16 = call bfloat 
@llvm.minnum.bf16(bfloat undef, bfloat undef) + %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) + %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef) + %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) + %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) + %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) + ret void +} + +define void @minnum_f32() { +; ALL-LABEL: 'minnum_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minnum.f32(float undef, float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minnum.v3f32(<3 x float> undef, <3 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'minnum_f32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minnum.f32(float undef, float undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minnum.v3f32(<3 x float> undef, <3 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.minnum.f32(float undef, float undef) + %v2f32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef) + %v3f32 = call <3 x float> @llvm.minnum.v3f32(<3 x float> undef, <3 x float> undef) + %v4f32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef) + %v8f32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef) + %v16f32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef) + ret void +} + +define void @minnum_f64() { +; ALL-LABEL: 'minnum_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minnum.f64(double undef, double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double>
@llvm.minnum.v3f64(<3 x double> undef, <3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minnum.v16f64(<16 x double> undef, <16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'minnum_f64' +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minnum.f64(double undef, double undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minnum.v3f64(<3 x double> undef, <3 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minnum.v16f64(<16 x double> undef, <16 x double> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.minnum.f64(double undef, double undef) + %v2f64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef) + %v3f64 = call <3 x double> @llvm.minnum.v3f64(<3 x double> undef, <3 x double> undef) + %v4f64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef) + %v8f64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef) + %v16f64 = call <16 x double> @llvm.minnum.v16f64(<16 x double> undef, <16 x double> undef) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; FASTF64: {{.*}} +; GFX90A-FASTF64: {{.*}} +; GFX90A-SIZE: {{.*}} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/ptrmask.ll b/llvm/test/Analysis/CostModel/AMDGPU/ptrmask.ll new file mode 100644 index 00000000000000..8600dd70da6206 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/ptrmask.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s +; END.
+ +define ptr @ptrmask_p0_i64(ptr %ptr, i64 %mask) { +; ALL-LABEL: 'ptrmask_p0_i64' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 %mask) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret ptr %result +; +; ALL-SIZE-LABEL: 'ptrmask_p0_i64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 %mask) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret ptr %result +; + %result = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 %mask) + ret ptr %result +} + +define <2 x ptr> @ptrmask_v2p0_v2i64(<2 x ptr> %ptr, <2 x i64> %mask) { +; ALL-LABEL: 'ptrmask_v2p0_v2i64' +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> %mask) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret <2 x ptr> %result +; +; ALL-SIZE-LABEL: 'ptrmask_v2p0_v2i64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> %mask) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x ptr> %result +; + %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> %mask) + ret <2 x ptr> %result +} + +define ptr addrspace(1) @ptrmask_p1_i64(ptr addrspace(1) %ptr, i64 %mask) { +; ALL-LABEL: 'ptrmask_p1_i64' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) %ptr, i64 %mask) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(1) %result +; +; ALL-SIZE-LABEL: 'ptrmask_p1_i64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) %ptr, i64 %mask) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(1) %result +; + %result = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) %ptr, i64 %mask) + ret ptr addrspace(1) %result +} + +define ptr addrspace(5) @ptrmask_p5_i32(ptr addrspace(5) %ptr, i32 %mask) { +; ALL-LABEL: 'ptrmask_p5_i32' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(5) @llvm.ptrmask.p5.i32(ptr addrspace(5) %ptr, i32 %mask) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(5) %result +; +; ALL-SIZE-LABEL: 'ptrmask_p5_i32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(5) @llvm.ptrmask.p5.i32(ptr addrspace(5) %ptr, i32 %mask) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(5) %result +; + %result = call ptr addrspace(5) @llvm.ptrmask.p5.i32(ptr addrspace(5) %ptr, i32 %mask) + ret ptr addrspace(5) %result +} + +define ptr addrspace(3) @ptrmask_p3_i32(ptr addrspace(3) %ptr, i32 %mask) { +; ALL-LABEL: 'ptrmask_p3_i32' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) %ptr, i32 %mask) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(3) %result +; +; ALL-SIZE-LABEL: 'ptrmask_p3_i32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(3) 
@llvm.ptrmask.p3.i32(ptr addrspace(3) %ptr, i32 %mask) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(3) %result +; + %result = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) %ptr, i32 %mask) + ret ptr addrspace(3) %result +} + +define <2 x ptr addrspace(5)> @ptrmask_v2p5_v2i32(<2 x ptr addrspace(5)> %ptr, <2 x i32> %mask) { +; ALL-LABEL: 'ptrmask_v2p5_v2i32' +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %result = call <2 x ptr addrspace(5)> @llvm.ptrmask.v2p5.v2i32(<2 x ptr addrspace(5)> %ptr, <2 x i32> %mask) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret <2 x ptr addrspace(5)> %result +; +; ALL-SIZE-LABEL: 'ptrmask_v2p5_v2i32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %result = call <2 x ptr addrspace(5)> @llvm.ptrmask.v2p5.v2i32(<2 x ptr addrspace(5)> %ptr, <2 x i32> %mask) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x ptr addrspace(5)> %result +; + %result = call <2 x ptr addrspace(5)> @llvm.ptrmask.v2p5.v2i32(<2 x ptr addrspace(5)> %ptr, <2 x i32> %mask) + ret <2 x ptr addrspace(5)> %result +} + +define <3 x ptr> @ptrmask_v3p0_v3i64(<3 x ptr> %ptr, <3 x i64> %mask) { +; ALL-LABEL: 'ptrmask_v3p0_v3i64' +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %result = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> %ptr, <3 x i64> %mask) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret <3 x ptr> %result +; +; ALL-SIZE-LABEL: 'ptrmask_v3p0_v3i64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %result = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> %ptr, <3 x i64> %mask) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <3 x ptr> %result +; + %result = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> %ptr, <3 x i64> %mask) + ret <3 x ptr> %result +} + +define <3 x ptr addrspace(5)> @ptrmask_v3p5_v3i32(<3 x ptr addrspace(5)> %ptr, <3 x i32> %mask) { +; ALL-LABEL: 'ptrmask_v3p5_v3i32' +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %result = call <3 x ptr addrspace(5)> @llvm.ptrmask.v3p5.v3i32(<3 x ptr addrspace(5)> %ptr, <3 x i32> %mask) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret <3 x ptr addrspace(5)> %result +; +; ALL-SIZE-LABEL: 'ptrmask_v3p5_v3i32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %result = call <3 x ptr addrspace(5)> @llvm.ptrmask.v3p5.v3i32(<3 x ptr addrspace(5)> %ptr, <3 x i32> %mask) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <3 x ptr addrspace(5)> %result +; + %result = call <3 x ptr addrspace(5)> @llvm.ptrmask.v3p5.v3i32(<3 x ptr addrspace(5)> %ptr, <3 x i32> %mask) + ret <3 x ptr addrspace(5)> %result +} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/sqrt.ll b/llvm/test/Analysis/CostModel/AMDGPU/sqrt.ll new file mode 100644 index 00000000000000..7136b70a041906 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/sqrt.ll @@ -0,0 +1,278 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s +; RUN:
opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s + +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s + +define void @sqrt_f16() { +; BASE-LABEL: 'sqrt_f16' +; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.sqrt.f16(half undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'sqrt_f16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'sqrt_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction:
%v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'sqrt_f16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'sqrt_f16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.sqrt.f16(half undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'sqrt_f16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'sqrt_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'sqrt_f16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.sqrt.f16(half undef) + %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) + %v3f16 = call <3 
x half> @llvm.sqrt.v3f16(<3 x half> undef) + %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) + %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef) + %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) + %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) + %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef) + ret void +} + +define void @sqrt_bf16() { +; BASE-LABEL: 'sqrt_bf16' +; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef) +; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'sqrt_bf16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'sqrt_bf16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef) +; GFX9-NEXT: Cost Model: 
Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX10-LABEL: 'sqrt_bf16' +; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef) +; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; BASE-SIZE-LABEL: 'sqrt_bf16' +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef) +; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'sqrt_bf16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) 
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'sqrt_bf16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX10-SIZE-LABEL: 'sqrt_bf16' +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef) +; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef) + %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef) + %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef) + %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) + %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef) + %v8bf16 = call <8 x 
bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) + %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) + %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef) + ret void +} + +define void @sqrt_f32() { +; ALL-LABEL: 'sqrt_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.sqrt.f32(float undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.sqrt.v3f32(<3 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.sqrt.v5f32(<5 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.sqrt.v17f32(<17 x float> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'sqrt_f32' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.sqrt.f32(float undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.sqrt.v3f32(<3 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.sqrt.v5f32(<5 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.sqrt.v17f32(<17 x float> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.sqrt.f32(float undef) + %v2f32 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef) + %v3f32 = call <3 x float> @llvm.sqrt.v3f32(<3 x float> undef) + %v4f32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) + %v5f32 = call <5 x float> @llvm.sqrt.v5f32(<5 x float> undef) + %v8f32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) + %v16f32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) + %v17f32 = call <17 x float> @llvm.sqrt.v17f32(<17 x float> undef) + ret void +} + +define void @sqrt_f64() { +; ALL-LABEL: 'sqrt_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.sqrt.f64(double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> 
@llvm.sqrt.v3f64(<3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f64 = call <5 x double> @llvm.sqrt.v5f64(<5 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.sqrt.v16f64(<16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f64 = call <17 x double> @llvm.sqrt.v17f64(<17 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; ALL-SIZE-LABEL: 'sqrt_f64' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.sqrt.f64(double undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.sqrt.v3f64(<3 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f64 = call <5 x double> @llvm.sqrt.v5f64(<5 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.sqrt.v16f64(<16 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f64 = call <17 x double> @llvm.sqrt.v17f64(<17 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.sqrt.f64(double undef) + %v2f64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) + %v3f64 = call <3 x double> @llvm.sqrt.v3f64(<3 x double> undef) + %v4f64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) + %v5f64 = call <5 x double> @llvm.sqrt.v5f64(<5 x double> undef) + %v8f64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) + %v16f64 = call <16 x double> @llvm.sqrt.v16f64(<16 x double> undef) + %v17f64 = call <17 x double> @llvm.sqrt.v17f64(<17 x double> undef) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index 7cb2452e1a148d..61be5ee458e9d2 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -2879,8 +2879,8 @@ entry: ret ptr %5 } -define i64 @srli_slliw(i64 %1) { -; RV64I-LABEL: srli_slliw: +define i64 @srli_slliuw(i64 %1) { +; RV64I-LABEL: srli_slliuw: ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: li a1, 1 @@ -2889,7 +2889,7 @@ define i64 @srli_slliw(i64 %1) { ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: ret ; -; RV64ZBA-LABEL: srli_slliw: +; RV64ZBA-LABEL: srli_slliuw: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: srli a0, a0, 2 ; RV64ZBA-NEXT: slli.uw a0, a0, 4 @@ -2901,8 +2901,8 @@ entry: ret i64 %4 } -define i64 @srli_slliw_canonical(i64 %0) { -; RV64I-LABEL: srli_slliw_canonical: +define i64 @srli_slliuw_canonical(i64 %0) { +; RV64I-LABEL: srli_slliuw_canonical: ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: slli a0, a0, 
2 ; RV64I-NEXT: li a1, 1 @@ -2911,7 +2911,7 @@ define i64 @srli_slliw_canonical(i64 %0) { ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: ret ; -; RV64ZBA-LABEL: srli_slliw_canonical: +; RV64ZBA-LABEL: srli_slliuw_canonical: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: srli a0, a0, 2 ; RV64ZBA-NEXT: slli.uw a0, a0, 4 @@ -2949,3 +2949,48 @@ entry: %4 = shl i64 %3, 4 ret i64 %4 } + +define i64 @srli_slliuw_2(i64 %1) { +; RV64I-LABEL: srli_slliuw_2: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: srli a0, a0, 15 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: slli a1, a1, 35 +; RV64I-NEXT: addi a1, a1, -8 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: srli_slliuw_2: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: srli a0, a0, 15 +; RV64ZBA-NEXT: srli a0, a0, 3 +; RV64ZBA-NEXT: slli.uw a0, a0, 3 +; RV64ZBA-NEXT: ret +entry: + %2 = lshr i64 %1, 18 + %3 = and i64 %2, 4294967295 + %4 = shl i64 %3, 3 + ret i64 %4 +} + +define i64 @srli_slliuw_canonical_2(i64 %0) { +; RV64I-LABEL: srli_slliuw_canonical_2: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: srli a0, a0, 15 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: slli a1, a1, 35 +; RV64I-NEXT: addi a1, a1, -8 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: srli_slliuw_canonical_2: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: srli a0, a0, 15 +; RV64ZBA-NEXT: srli a0, a0, 3 +; RV64ZBA-NEXT: slli.uw a0, a0, 3 +; RV64ZBA-NEXT: ret +entry: + %1 = lshr i64 %0, 15 + %2 = and i64 %1, 34359738360 + ret i64 %2 +} diff --git a/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll b/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll index c99388cbdaf441..93fe66695b70ec 100644 --- a/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll +++ b/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll @@ -27,8 +27,12 @@ define riscv_vector_cc @test_vector_callee_cfi( @test_vector_callee_cfi(This Inner Loop Header: Depth=1 +; V8-NEXT: cmp %o4, 10 +; V8-NEXT: be .LBB7_3 +; V8-NEXT: nop +; V8-NEXT: ! %bb.4: ! %exit.0 +; V8-NEXT: mov %g0, %o0 +; V8-NEXT: retl +; V8-NEXT: add %sp, 112, %sp +; +; V9-LABEL: test_float_cc: +; V9: ! %bb.0: ! %entry +; V9-NEXT: add %sp, -112, %sp +; V9-NEXT: ! kill: def $o3 killed $o3 killed $o2_o3 def $o2_o3 +; V9-NEXT: ! kill: def $o1 killed $o1 killed $o0_o1 def $o0_o1 +; V9-NEXT: ! kill: def $o2 killed $o2 killed $o2_o3 def $o2_o3 +; V9-NEXT: std %o2, [%sp+96] +; V9-NEXT: ! kill: def $o0 killed $o0 killed $o0_o1 def $o0_o1 +; V9-NEXT: std %o0, [%sp+104] +; V9-NEXT: ldd [%sp+104], %f2 +; V9-NEXT: sethi %hi(.LCPI7_0), %o0 +; V9-NEXT: ldd [%o0+%lo(.LCPI7_0)], %f0 +; V9-NEXT: fcmpd %fcc0, %f2, %f0 +; V9-NEXT: fbuge %fcc0, .LBB7_3 +; V9-NEXT: nop +; V9-NEXT: ! %bb.1: ! %loop.2 +; V9-NEXT: ldd [%sp+96], %f2 +; V9-NEXT: fcmpd %fcc0, %f2, %f0 +; V9-NEXT: fbule %fcc0, .LBB7_3 +; V9-NEXT: nop +; V9-NEXT: ! %bb.2: ! %exit.1 +; V9-NEXT: mov 1, %o0 +; V9-NEXT: retl +; V9-NEXT: add %sp, 112, %sp +; V9-NEXT: .LBB7_3: ! %loop +; V9-NEXT: ! =>This Inner Loop Header: Depth=1 +; V9-NEXT: cmp %o4, 10 +; V9-NEXT: be %icc, .LBB7_3 +; V9-NEXT: nop +; V9-NEXT: ! %bb.4: ! %exit.0 +; V9-NEXT: mov %g0, %o0 +; V9-NEXT: retl +; V9-NEXT: add %sp, 112, %sp +; +; SPARC64-LABEL: test_float_cc: +; SPARC64: ! %bb.0: ! %entry +; SPARC64-NEXT: sethi %h44(.LCPI7_0), %o0 +; SPARC64-NEXT: add %o0, %m44(.LCPI7_0), %o0 +; SPARC64-NEXT: sllx %o0, 12, %o0 +; SPARC64-NEXT: ldd [%o0+%l44(.LCPI7_0)], %f4 +; SPARC64-NEXT: fcmpd %fcc0, %f0, %f4 +; SPARC64-NEXT: fbuge %fcc0, .LBB7_3 +; SPARC64-NEXT: nop +; SPARC64-NEXT: ! %bb.1: ! 
%loop.2 +; SPARC64-NEXT: fcmpd %fcc0, %f2, %f4 +; SPARC64-NEXT: fbule %fcc0, .LBB7_3 +; SPARC64-NEXT: nop +; SPARC64-NEXT: ! %bb.2: ! %exit.1 +; SPARC64-NEXT: retl +; SPARC64-NEXT: mov 1, %o0 +; SPARC64-NEXT: .LBB7_3: ! %loop +; SPARC64-NEXT: ! =>This Inner Loop Header: Depth=1 +; SPARC64-NEXT: cmp %o2, 10 +; SPARC64-NEXT: be %icc, .LBB7_3 +; SPARC64-NEXT: nop +; SPARC64-NEXT: ! %bb.4: ! %exit.0 +; SPARC64-NEXT: retl +; SPARC64-NEXT: mov %g0, %o0 entry: -; V8-LABEL: test_float_cc -; V8: fcmpd -; V8: {{fbl|fbuge}} .LBB -; V8: fcmpd -; V8: {{fbule|fbg}} .LBB - -; V9-LABEL: test_float_cc -; V9: fcmpd %fcc0 -; V9: {{fbl|fbuge}} %fcc0, .LBB -; V9: fcmpd %fcc0 -; V9: {{fbule|fbg}} %fcc0, .LBB - %0 = fcmp uge double %a, 0.000000e+00 br i1 %0, label %loop, label %loop.2 @@ -141,39 +472,92 @@ exit.1: ret i32 1 } -; V8-LABEL: test_adde_sube -; V8: addcc -; V8: addxcc -; V8: addxcc -; V8: addxcc -; V8: subcc -; V8: subxcc -; V8: subxcc -; V8: subxcc - - -; V9-LABEL: test_adde_sube -; V9: addcc -; V9: addxcc -; V9: addxcc -; V9: addxcc -; V9: subcc -; V9: subxcc -; V9: subxcc -; V9: subxcc - -; SPARC64-LABEL: test_adde_sube -; SPARC64: addcc -; SPARC64: addxcc -; SPARC64: addxcc -; SPARC64: addxcc -; SPARC64: subcc -; SPARC64: subxcc -; SPARC64: subxcc -; SPARC64: subxcc - - -define void @test_adde_sube(ptr %a, ptr %b, ptr %sum, ptr %diff) { +define void @test_adde_sube(ptr %a, ptr %b, ptr %sum, ptr %diff) nounwind { +; V8-LABEL: test_adde_sube: +; V8: ! %bb.0: ! %entry +; V8-NEXT: save %sp, -96, %sp +; V8-NEXT: ldd [%i0+8], %i4 +; V8-NEXT: ldd [%i1+8], %l0 +; V8-NEXT: ldd [%i0], %g2 +; V8-NEXT: ldd [%i1], %l2 +; V8-NEXT: addcc %i5, %l1, %l5 +; V8-NEXT: addxcc %i4, %l0, %l4 +; V8-NEXT: addxcc %g3, %l3, %l1 +; V8-NEXT: addxcc %g2, %l2, %l0 +; V8-NEXT: std %l4, [%i2+8] +; V8-NEXT: std %l0, [%i2] +; V8-NEXT: !APP +; V8-NEXT: !NO_APP +; V8-NEXT: ldd [%i0+8], %l0 +; V8-NEXT: ldd [%i0], %i0 +; V8-NEXT: subcc %i5, %l1, %l3 +; V8-NEXT: subxcc %i4, %l0, %l2 +; V8-NEXT: subxcc %g3, %i1, %i5 +; V8-NEXT: subxcc %g2, %i0, %i4 +; V8-NEXT: std %l2, [%i3+8] +; V8-NEXT: std %i4, [%i3] +; V8-NEXT: ret +; V8-NEXT: restore +; +; V9-LABEL: test_adde_sube: +; V9: ! %bb.0: ! %entry +; V9-NEXT: save %sp, -96, %sp +; V9-NEXT: ldd [%i0+8], %i4 +; V9-NEXT: ldd [%i1+8], %l0 +; V9-NEXT: ldd [%i0], %g2 +; V9-NEXT: ldd [%i1], %l2 +; V9-NEXT: addcc %i5, %l1, %l5 +; V9-NEXT: addxcc %i4, %l0, %l4 +; V9-NEXT: addxcc %g3, %l3, %l1 +; V9-NEXT: addxcc %g2, %l2, %l0 +; V9-NEXT: std %l4, [%i2+8] +; V9-NEXT: std %l0, [%i2] +; V9-NEXT: !APP +; V9-NEXT: !NO_APP +; V9-NEXT: ldd [%i0+8], %l0 +; V9-NEXT: ldd [%i0], %i0 +; V9-NEXT: subcc %i5, %l1, %l3 +; V9-NEXT: subxcc %i4, %l0, %l2 +; V9-NEXT: subxcc %g3, %i1, %i5 +; V9-NEXT: subxcc %g2, %i0, %i4 +; V9-NEXT: std %l2, [%i3+8] +; V9-NEXT: std %i4, [%i3] +; V9-NEXT: ret +; V9-NEXT: restore +; +; SPARC64-LABEL: test_adde_sube: +; SPARC64: .register %g2, #scratch +; SPARC64-NEXT: .register %g3, #scratch +; SPARC64-NEXT: ! %bb.0: ! 
%entry +; SPARC64-NEXT: save %sp, -128, %sp +; SPARC64-NEXT: ldx [%i0+8], %i4 +; SPARC64-NEXT: ldx [%i0], %i5 +; SPARC64-NEXT: ldx [%i1], %g2 +; SPARC64-NEXT: ldx [%i1+8], %i1 +; SPARC64-NEXT: mov %g0, %g3 +; SPARC64-NEXT: add %i5, %g2, %g2 +; SPARC64-NEXT: add %i4, %i1, %i1 +; SPARC64-NEXT: cmp %i1, %i4 +; SPARC64-NEXT: movcs %xcc, 1, %g3 +; SPARC64-NEXT: srl %g3, 0, %g3 +; SPARC64-NEXT: add %g2, %g3, %g2 +; SPARC64-NEXT: stx %i1, [%i2+8] +; SPARC64-NEXT: stx %g2, [%i2] +; SPARC64-NEXT: !APP +; SPARC64-NEXT: !NO_APP +; SPARC64-NEXT: ldx [%i0+8], %i1 +; SPARC64-NEXT: mov %g0, %i2 +; SPARC64-NEXT: ldx [%i0], %i0 +; SPARC64-NEXT: cmp %i4, %i1 +; SPARC64-NEXT: movcs %xcc, 1, %i2 +; SPARC64-NEXT: srl %i2, 0, %i2 +; SPARC64-NEXT: sub %i5, %i0, %i0 +; SPARC64-NEXT: sub %i0, %i2, %i0 +; SPARC64-NEXT: sub %i4, %i1, %i1 +; SPARC64-NEXT: stx %i1, [%i3+8] +; SPARC64-NEXT: stx %i0, [%i3] +; SPARC64-NEXT: ret +; SPARC64-NEXT: restore entry: %0 = bitcast ptr %a to ptr %1 = bitcast ptr %b to ptr diff --git a/llvm/test/CodeGen/SPARC/64cond.ll b/llvm/test/CodeGen/SPARC/64cond.ll index 10d070055a4ecc..5a900220046642 100644 --- a/llvm/test/CodeGen/SPARC/64cond.ll +++ b/llvm/test/CodeGen/SPARC/64cond.ll @@ -1,10 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=sparc64-pc-openbsd -disable-sparc-leaf-proc | FileCheck %s ; Testing 64-bit conditionals. The sparc64 triple is an alias for sparcv9. -; CHECK: cmpri -; CHECK: cmp %i1, 1 -; CHECK: be %xcc, -define void @cmpri(ptr %p, i64 %x) { +define void @cmpri(ptr %p, i64 %x) nounwind { +; CHECK-LABEL: cmpri: +; CHECK: ! %bb.0: ! %entry +; CHECK-NEXT: save %sp, -128, %sp +; CHECK-NEXT: cmp %i1, 1 +; CHECK-NEXT: be %xcc, .LBB0_2 +; CHECK-NEXT: nop +; CHECK-NEXT: ! %bb.1: ! %if.then +; CHECK-NEXT: stx %i1, [%i0] +; CHECK-NEXT: .LBB0_2: ! %if.end +; CHECK-NEXT: ret +; CHECK-NEXT: restore entry: %tobool = icmp eq i64 %x, 1 br i1 %tobool, label %if.end, label %if.then @@ -17,10 +26,18 @@ if.end: ret void } -; CHECK: cmprr -; CHECK: cmp %i1, %i2 -; CHECK: bgu %xcc, -define void @cmprr(ptr %p, i64 %x, i64 %y) { +define void @cmprr(ptr %p, i64 %x, i64 %y) nounwind { +; CHECK-LABEL: cmprr: +; CHECK: ! %bb.0: ! %entry +; CHECK-NEXT: save %sp, -128, %sp +; CHECK-NEXT: cmp %i1, %i2 +; CHECK-NEXT: bgu %xcc, .LBB1_2 +; CHECK-NEXT: nop +; CHECK-NEXT: ! %bb.1: ! %if.then +; CHECK-NEXT: stx %i1, [%i0] +; CHECK-NEXT: .LBB1_2: ! %if.end +; CHECK-NEXT: ret +; CHECK-NEXT: restore entry: %tobool = icmp ugt i64 %x, %y br i1 %tobool, label %if.end, label %if.then @@ -33,67 +50,87 @@ if.end: ret void } -; CHECK: selecti32_xcc -; CHECK: cmp %i0, %i1 -; CHECK: movg %xcc, %i2, %i3 -; CHECK: restore %g0, %i3, %o0 -define i32 @selecti32_xcc(i64 %x, i64 %y, i32 %a, i32 %b) { +define i32 @selecti32_xcc(i64 %x, i64 %y, i32 %a, i32 %b) nounwind { +; CHECK-LABEL: selecti32_xcc: +; CHECK: ! %bb.0: ! %entry +; CHECK-NEXT: save %sp, -128, %sp +; CHECK-NEXT: cmp %i0, %i1 +; CHECK-NEXT: movg %xcc, %i2, %i3 +; CHECK-NEXT: ret +; CHECK-NEXT: restore %g0, %i3, %o0 entry: %tobool = icmp sgt i64 %x, %y %rv = select i1 %tobool, i32 %a, i32 %b ret i32 %rv } -; CHECK: selecti64_xcc -; CHECK: cmp %i0, %i1 -; CHECK: movg %xcc, %i2, %i3 -; CHECK: restore %g0, %i3, %o0 -define i64 @selecti64_xcc(i64 %x, i64 %y, i64 %a, i64 %b) { +define i64 @selecti64_xcc(i64 %x, i64 %y, i64 %a, i64 %b) nounwind { +; CHECK-LABEL: selecti64_xcc: +; CHECK: ! %bb.0: ! 
%entry +; CHECK-NEXT: save %sp, -128, %sp +; CHECK-NEXT: cmp %i0, %i1 +; CHECK-NEXT: movg %xcc, %i2, %i3 +; CHECK-NEXT: ret +; CHECK-NEXT: restore %g0, %i3, %o0 entry: %tobool = icmp sgt i64 %x, %y %rv = select i1 %tobool, i64 %a, i64 %b ret i64 %rv } -; CHECK: selecti64_icc -; CHECK: cmp %i0, %i1 -; CHECK: movg %icc, %i2, %i3 -; CHECK: restore %g0, %i3, %o0 -define i64 @selecti64_icc(i32 %x, i32 %y, i64 %a, i64 %b) { +define i64 @selecti64_icc(i32 %x, i32 %y, i64 %a, i64 %b) nounwind { +; CHECK-LABEL: selecti64_icc: +; CHECK: ! %bb.0: ! %entry +; CHECK-NEXT: save %sp, -128, %sp +; CHECK-NEXT: cmp %i0, %i1 +; CHECK-NEXT: movg %icc, %i2, %i3 +; CHECK-NEXT: ret +; CHECK-NEXT: restore %g0, %i3, %o0 entry: %tobool = icmp sgt i32 %x, %y %rv = select i1 %tobool, i64 %a, i64 %b ret i64 %rv } -; CHECK: selecti64_fcc -; CHECK: mov %i3, %i0 -; CHECK: fcmps %fcc0, %f1, %f3 -; CHECK: movul %fcc0, %i2, %i0 -; CHECK: restore -define i64 @selecti64_fcc(float %x, float %y, i64 %a, i64 %b) { +define i64 @selecti64_fcc(float %x, float %y, i64 %a, i64 %b) nounwind { +; CHECK-LABEL: selecti64_fcc: +; CHECK: ! %bb.0: ! %entry +; CHECK-NEXT: save %sp, -128, %sp +; CHECK-NEXT: mov %i3, %i0 +; CHECK-NEXT: fcmps %fcc0, %f1, %f3 +; CHECK-NEXT: movul %fcc0, %i2, %i0 +; CHECK-NEXT: ret +; CHECK-NEXT: restore entry: %tobool = fcmp ult float %x, %y %rv = select i1 %tobool, i64 %a, i64 %b ret i64 %rv } -; CHECK: selectf32_xcc -; CHECK: fmovs %f7, %f0 -; CHECK: cmp %i0, %i1 -; CHECK: fmovsg %xcc, %f5, %f0 -define float @selectf32_xcc(i64 %x, i64 %y, float %a, float %b) { +define float @selectf32_xcc(i64 %x, i64 %y, float %a, float %b) nounwind { +; CHECK-LABEL: selectf32_xcc: +; CHECK: ! %bb.0: ! %entry +; CHECK-NEXT: save %sp, -128, %sp +; CHECK-NEXT: fmovs %f7, %f0 +; CHECK-NEXT: cmp %i0, %i1 +; CHECK-NEXT: fmovsg %xcc, %f5, %f0 +; CHECK-NEXT: ret +; CHECK-NEXT: restore entry: %tobool = icmp sgt i64 %x, %y %rv = select i1 %tobool, float %a, float %b ret float %rv } -; CHECK: selectf64_xcc -; CHECK: fmovd %f6, %f0 -; CHECK: cmp %i0, %i1 -; CHECK: fmovdg %xcc, %f4, %f0 -define double @selectf64_xcc(i64 %x, i64 %y, double %a, double %b) { +define double @selectf64_xcc(i64 %x, i64 %y, double %a, double %b) nounwind { +; CHECK-LABEL: selectf64_xcc: +; CHECK: ! %bb.0: ! %entry +; CHECK-NEXT: save %sp, -128, %sp +; CHECK-NEXT: fmovd %f6, %f0 +; CHECK-NEXT: cmp %i0, %i1 +; CHECK-NEXT: fmovdg %xcc, %f4, %f0 +; CHECK-NEXT: ret +; CHECK-NEXT: restore entry: %tobool = icmp sgt i64 %x, %y %rv = select i1 %tobool, double %a, double %b @@ -101,26 +138,38 @@ entry: } ; The MOVXCC instruction can't use %g0 for its tied operand. -; CHECK: select_consti64_xcc -; CHECK: cmp -; CHECK: movg %xcc, 123, %i{{[0-2]}} -define i64 @select_consti64_xcc(i64 %x, i64 %y) { +define i64 @select_consti64_xcc(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: select_consti64_xcc: +; CHECK: ! %bb.0: ! %entry +; CHECK-NEXT: save %sp, -128, %sp +; CHECK-NEXT: mov %g0, %i2 +; CHECK-NEXT: cmp %i0, %i1 +; CHECK-NEXT: movg %xcc, 123, %i2 +; CHECK-NEXT: ret +; CHECK-NEXT: restore %g0, %i2, %o0 entry: %tobool = icmp sgt i64 %x, %y %rv = select i1 %tobool, i64 123, i64 0 ret i64 %rv } -; CHECK-LABEL: setcc_resultty -; CHECK-DAG: mov %g0, %o0 -; CHECK-DAG: mov %i0, %o1 -; CHECK-DAG: mov %g0, %o2 -; CHECK-DAG: mov 32, %o3 -; CHECK-DAG: call __multi3 -; CHECK: movrnz %o0, 1, [[R:%[gilo][0-7]]] -; CHECK: or [[R]], %i1, %i0 - -define i1 @setcc_resultty(i64 %a, i1 %b) { +define i1 @setcc_resultty(i64 %a, i1 %b) nounwind { +; CHECK-LABEL: setcc_resultty: +; CHECK: ! 
%bb.0: +; CHECK-NEXT: save %sp, -128, %sp +; CHECK-NEXT: mov %g0, %i2 +; CHECK-NEXT: sethi 4194303, %i3 +; CHECK-NEXT: or %i3, 1023, %i3 +; CHECK-NEXT: sethi 131071, %i4 +; CHECK-NEXT: or %i4, 1023, %i4 +; CHECK-NEXT: sllx %i4, 32, %i4 +; CHECK-NEXT: or %i4, %i3, %i3 +; CHECK-NEXT: and %i0, %i3, %i3 +; CHECK-NEXT: cmp %i3, %i0 +; CHECK-NEXT: movne %xcc, 1, %i2 +; CHECK-NEXT: or %i2, %i1, %i0 +; CHECK-NEXT: ret +; CHECK-NEXT: restore %a0 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %a, i64 32) %a1 = extractvalue { i64, i1 } %a0, 1 %a4 = or i1 %a1, %b diff --git a/llvm/test/CodeGen/SPARC/fp128-split.ll b/llvm/test/CodeGen/SPARC/fp128-split.ll index c87cfb932b7819..8a127c9c28cc71 100644 --- a/llvm/test/CodeGen/SPARC/fp128-split.ll +++ b/llvm/test/CodeGen/SPARC/fp128-split.ll @@ -8,45 +8,33 @@ define fp128 @testcase(fp128 %0) { ; CHECK-LABEL: name: testcase ; CHECK: bb.0.Entry: - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:qfpregs = COPY $q0 - ; CHECK: [[COPY1:%[0-9]+]]:dfpregs = COPY [[COPY]].sub_odd64 - ; CHECK: [[ADDri:%[0-9]+]]:i64regs = ADDri %stack.0, 0 - ; CHECK: [[ORri:%[0-9]+]]:i64regs = ORri killed [[ADDri]], 8 - ; CHECK: STDFrr [[ORri]], $g0, killed [[COPY1]] :: (store (s64) into %stack.0 + 8) - ; CHECK: [[COPY2:%[0-9]+]]:dfpregs = COPY [[COPY]].sub_even64 - ; CHECK: STDFri %stack.0, 0, killed [[COPY2]] :: (store (s64) into %stack.0, align 16) - ; CHECK: [[LDXrr:%[0-9]+]]:i64regs = LDXrr [[ORri]], $g0 :: (load (s64) from %stack.0 + 8) - ; CHECK: [[LDXri:%[0-9]+]]:i64regs = LDXri %stack.0, 0 :: (load (s64) from %stack.0, align 16) - ; CHECK: [[COPY3:%[0-9]+]]:intregs = COPY [[LDXrr]] - ; CHECK: [[COPY4:%[0-9]+]]:intregs = COPY [[LDXri]] - ; CHECK: [[SRLXri:%[0-9]+]]:i64regs = SRLXri [[LDXrr]], 32 - ; CHECK: [[COPY5:%[0-9]+]]:intregs = COPY [[SRLXri]] - ; CHECK: [[SRLXri1:%[0-9]+]]:i64regs = SRLXri [[LDXri]], 32 - ; CHECK: [[COPY6:%[0-9]+]]:intregs = COPY [[SRLXri1]] - ; CHECK: [[ADDCCri:%[0-9]+]]:intregs = ADDCCri killed [[COPY3]], -1, implicit-def $icc - ; CHECK: [[ADDEri:%[0-9]+]]:intregs = ADDEri killed [[COPY5]], -1, implicit-def $icc, implicit $icc - ; CHECK: [[ADDEri1:%[0-9]+]]:intregs = ADDEri killed [[COPY4]], -1, implicit-def $icc, implicit $icc - ; CHECK: [[ADDEri2:%[0-9]+]]:intregs = ADDEri killed [[COPY6]], -1, implicit-def dead $icc, implicit $icc - ; CHECK: [[SRLri:%[0-9]+]]:i64regs = SRLri killed [[ADDCCri]], 0 - ; CHECK: [[COPY7:%[0-9]+]]:i64regs = COPY [[ADDEri]] - ; CHECK: [[SLLXri:%[0-9]+]]:i64regs = SLLXri killed [[COPY7]], 32 - ; CHECK: [[ORrr:%[0-9]+]]:i64regs = ORrr killed [[SLLXri]], killed [[SRLri]] - ; CHECK: [[ADDri1:%[0-9]+]]:i64regs = ADDri %stack.1, 0 - ; CHECK: [[ORri1:%[0-9]+]]:i64regs = ORri killed [[ADDri1]], 8 - ; CHECK: STXrr [[ORri1]], $g0, killed [[ORrr]] :: (store (s64) into %stack.1 + 8, basealign 16) - ; CHECK: [[SRLri1:%[0-9]+]]:i64regs = SRLri killed [[ADDEri1]], 0 - ; CHECK: [[COPY8:%[0-9]+]]:i64regs = COPY [[ADDEri2]] - ; CHECK: [[SLLXri1:%[0-9]+]]:i64regs = SLLXri killed [[COPY8]], 32 - ; CHECK: [[ORrr1:%[0-9]+]]:i64regs = ORrr killed [[SLLXri1]], killed [[SRLri1]] - ; CHECK: STXri %stack.1, 0, killed [[ORrr1]] :: (store (s64) into %stack.1, align 16) - ; CHECK: [[LDDFri:%[0-9]+]]:dfpregs = LDDFri %stack.1, 0 :: (load (s64) from %stack.1, align 16) - ; CHECK: [[DEF:%[0-9]+]]:qfpregs = IMPLICIT_DEF - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:qfpregs = INSERT_SUBREG [[DEF]], killed [[LDDFri]], %subreg.sub_even64 - ; CHECK: [[LDDFrr:%[0-9]+]]:dfpregs = LDDFrr [[ORri1]], $g0 :: (load (s64) from %stack.1 + 8) - ; CHECK: 
[[INSERT_SUBREG1:%[0-9]+]]:qfpregs = INSERT_SUBREG [[INSERT_SUBREG]], killed [[LDDFrr]], %subreg.sub_odd64 - ; CHECK: $q0 = COPY [[INSERT_SUBREG1]] - ; CHECK: RETL 8, implicit $q0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:qfpregs = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:dfpregs = COPY [[COPY]].sub_odd64 + ; CHECK-NEXT: [[ADDri:%[0-9]+]]:i64regs = ADDri %stack.0, 0 + ; CHECK-NEXT: [[ORri:%[0-9]+]]:i64regs = ORri killed [[ADDri]], 8 + ; CHECK-NEXT: STDFrr [[ORri]], $g0, killed [[COPY1]] :: (store (s64) into %stack.0 + 8) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:dfpregs = COPY [[COPY]].sub_even64 + ; CHECK-NEXT: STDFri %stack.0, 0, killed [[COPY2]] :: (store (s64) into %stack.0, align 16) + ; CHECK-NEXT: [[LDXrr:%[0-9]+]]:i64regs = LDXrr [[ORri]], $g0 :: (load (s64) from %stack.0 + 8) + ; CHECK-NEXT: [[LDXri:%[0-9]+]]:i64regs = LDXri %stack.0, 0 :: (load (s64) from %stack.0, align 16) + ; CHECK-NEXT: [[ADDri1:%[0-9]+]]:i64regs = ADDri %stack.1, 0 + ; CHECK-NEXT: [[ORri1:%[0-9]+]]:i64regs = ORri killed [[ADDri1]], 8 + ; CHECK-NEXT: [[ADDri2:%[0-9]+]]:i64regs = ADDri [[LDXrr]], -1 + ; CHECK-NEXT: STXrr [[ORri1]], $g0, killed [[ADDri2]] :: (store (s64) into %stack.1 + 8, basealign 16) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:intregs = COPY $g0 + ; CHECK-NEXT: [[MOVRri:%[0-9]+]]:intregs = MOVRri [[LDXrr]], 1, [[COPY3]], 49 + ; CHECK-NEXT: [[SRLri:%[0-9]+]]:i64regs = SRLri killed [[MOVRri]], 0 + ; CHECK-NEXT: [[SUBrr:%[0-9]+]]:i64regs = SUBrr killed [[LDXri]], killed [[SRLri]] + ; CHECK-NEXT: STXri %stack.1, 0, killed [[SUBrr]] :: (store (s64) into %stack.1, align 16) + ; CHECK-NEXT: [[LDDFri:%[0-9]+]]:dfpregs = LDDFri %stack.1, 0 :: (load (s64) from %stack.1, align 16) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:qfpregs = IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:qfpregs = INSERT_SUBREG [[DEF]], killed [[LDDFri]], %subreg.sub_even64 + ; CHECK-NEXT: [[LDDFrr:%[0-9]+]]:dfpregs = LDDFrr [[ORri1]], $g0 :: (load (s64) from %stack.1 + 8) + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:qfpregs = INSERT_SUBREG [[INSERT_SUBREG]], killed [[LDDFrr]], %subreg.sub_odd64 + ; CHECK-NEXT: $q0 = COPY [[INSERT_SUBREG1]] + ; CHECK-NEXT: RETL 8, implicit $q0 Entry: %1 = bitcast fp128 %0 to i128 %2 = add i128 %1, -1 diff --git a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll index ae1de443bce05f..ac0b1128ca812a 100644 --- a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll @@ -2,14 +2,10 @@ ; RUN: llc < %s -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC ; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC64 -define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { +define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC-LABEL: muloti_test: -; SPARC: .cfi_startproc -; SPARC-NEXT: ! %bb.0: ! %start +; SPARC: ! %bb.0: ! 
%start ; SPARC-NEXT: save %sp, -96, %sp -; SPARC-NEXT: .cfi_def_cfa_register %fp -; SPARC-NEXT: .cfi_window_save -; SPARC-NEXT: .cfi_register %o7, %i7 ; SPARC-NEXT: ld [%fp+96], %l1 ; SPARC-NEXT: mov %i3, %g4 ; SPARC-NEXT: mov %i2, %g2 @@ -172,105 +168,97 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: restore %g0, %g3, %o1 ; ; SPARC64-LABEL: muloti_test: -; SPARC64: .cfi_startproc -; SPARC64-NEXT: .register %g2, #scratch +; SPARC64: .register %g2, #scratch ; SPARC64-NEXT: .register %g3, #scratch ; SPARC64-NEXT: ! %bb.0: ! %start ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: .cfi_def_cfa_register %fp -; SPARC64-NEXT: .cfi_window_save -; SPARC64-NEXT: .cfi_register %o7, %i7 -; SPARC64-NEXT: mov %i3, %i4 -; SPARC64-NEXT: mov %i1, %i3 -; SPARC64-NEXT: srax %i0, 63, %o2 -; SPARC64-NEXT: mov %i2, %o0 -; SPARC64-NEXT: mov %i4, %o1 -; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %o2, %o3 -; SPARC64-NEXT: mov %o0, %i1 -; SPARC64-NEXT: mov %o1, %i5 -; SPARC64-NEXT: srax %i2, 63, %o0 -; SPARC64-NEXT: mov %o0, %o1 -; SPARC64-NEXT: mov %i0, %o2 -; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i3, %o3 -; SPARC64-NEXT: srlx %i5, 32, %g2 -; SPARC64-NEXT: srlx %o1, 32, %g3 -; SPARC64-NEXT: srlx %i1, 32, %g4 -; SPARC64-NEXT: srlx %o0, 32, %g5 -; SPARC64-NEXT: addcc %o1, %i5, %l0 -; SPARC64-NEXT: addxcc %g3, %g2, %l1 -; SPARC64-NEXT: addxcc %o0, %i1, %l2 -; SPARC64-NEXT: addxcc %g5, %g4, %l3 +; SPARC64-NEXT: mov %i3, %i5 +; SPARC64-NEXT: mov %i2, %i3 +; SPARC64-NEXT: mov %i1, %i2 +; SPARC64-NEXT: mov %i0, %i4 ; SPARC64-NEXT: mov %g0, %o0 -; SPARC64-NEXT: mov %i3, %o1 +; SPARC64-NEXT: mov %i1, %o1 ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i4, %o3 -; SPARC64-NEXT: mov %o0, %i5 +; SPARC64-NEXT: mov %i5, %o3 +; SPARC64-NEXT: mov %o0, %i0 ; SPARC64-NEXT: mov %o1, %i1 ; SPARC64-NEXT: mov %g0, %o0 -; SPARC64-NEXT: mov %i0, %o1 +; SPARC64-NEXT: mov %i4, %o1 ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i4, %o3 -; SPARC64-NEXT: srlx %i5, 32, %i4 -; SPARC64-NEXT: srlx %o1, 32, %g2 -; SPARC64-NEXT: srlx %o0, 32, %g3 -; SPARC64-NEXT: addcc %o1, %i5, %i5 -; SPARC64-NEXT: addxcc %g2, %i4, %i4 -; SPARC64-NEXT: addxcc %o0, 0, %l4 -; SPARC64-NEXT: addxcc %g3, 0, %l5 +; SPARC64-NEXT: mov %i5, %o3 +; SPARC64-NEXT: mov %g0, %g2 +; SPARC64-NEXT: add %o1, %i0, %i0 +; SPARC64-NEXT: cmp %i0, %o1 +; SPARC64-NEXT: movcs %xcc, 1, %g2 +; SPARC64-NEXT: srl %g2, 0, %g2 +; SPARC64-NEXT: add %o0, %g2, %l0 ; SPARC64-NEXT: mov %g0, %o0 -; SPARC64-NEXT: mov %i3, %o1 +; SPARC64-NEXT: mov %i2, %o1 ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i2, %o3 -; SPARC64-NEXT: srlx %o1, 32, %g2 -; SPARC64-NEXT: srlx %o0, 32, %g3 -; SPARC64-NEXT: addcc %o1, %i5, %i3 -; SPARC64-NEXT: addxcc %g2, %i4, %i4 -; SPARC64-NEXT: addxcc %o0, 0, %i5 -; SPARC64-NEXT: addxcc %g3, 0, %g2 -; SPARC64-NEXT: addcc %l4, %i5, %i5 -; SPARC64-NEXT: addxcc %l5, %g2, %l4 -; SPARC64-NEXT: addxcc %g0, 0, %l5 -; SPARC64-NEXT: addxcc %g0, 0, %l6 +; SPARC64-NEXT: mov %i3, %o3 +; SPARC64-NEXT: mov %g0, %g2 +; SPARC64-NEXT: mov %g0, %g3 +; SPARC64-NEXT: add %o1, %i0, %i0 +; SPARC64-NEXT: cmp %i0, %o1 +; SPARC64-NEXT: movcs %xcc, 1, %g2 +; SPARC64-NEXT: srl %g2, 0, %g2 +; SPARC64-NEXT: add %o0, %g2, %g2 +; SPARC64-NEXT: add %l0, %g2, %l1 +; SPARC64-NEXT: cmp %l1, %l0 +; SPARC64-NEXT: movcs %xcc, 1, %g3 +; SPARC64-NEXT: srl %g3, 0, %l0 ; SPARC64-NEXT: mov %g0, %o0 -; SPARC64-NEXT: mov %i0, %o1 +; SPARC64-NEXT: mov %i4, %o1 ; 
SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 +; SPARC64-NEXT: mov %i3, %o3 +; SPARC64-NEXT: mov %g0, %g2 +; SPARC64-NEXT: add %o0, %l0, %g3 +; SPARC64-NEXT: add %o1, %l1, %l1 +; SPARC64-NEXT: cmp %l1, %o1 +; SPARC64-NEXT: movcs %xcc, 1, %g2 +; SPARC64-NEXT: srl %g2, 0, %g2 +; SPARC64-NEXT: add %g3, %g2, %l2 +; SPARC64-NEXT: srax %i4, 63, %o2 +; SPARC64-NEXT: mov %i3, %o0 +; SPARC64-NEXT: mov %i5, %o1 +; SPARC64-NEXT: call __multi3 +; SPARC64-NEXT: mov %o2, %o3 +; SPARC64-NEXT: mov %o0, %i5 +; SPARC64-NEXT: mov %o1, %l0 +; SPARC64-NEXT: srax %i3, 63, %o0 +; SPARC64-NEXT: mov %o0, %o1 +; SPARC64-NEXT: mov %i4, %o2 +; SPARC64-NEXT: call __multi3 ; SPARC64-NEXT: mov %i2, %o3 ; SPARC64-NEXT: mov %g0, %i2 -; SPARC64-NEXT: srlx %o1, 32, %i0 -; SPARC64-NEXT: addcc %o1, %i5, %i5 -; SPARC64-NEXT: srlx %o0, 32, %g2 -; SPARC64-NEXT: addxcc %i0, %l4, %i0 -; SPARC64-NEXT: addxcc %o0, %l5, %g3 -; SPARC64-NEXT: addxcc %g2, %l6, %g2 -; SPARC64-NEXT: addcc %i5, %l0, %i5 -; SPARC64-NEXT: addxcc %i0, %l1, %i0 -; SPARC64-NEXT: addxcc %g3, %l2, %g3 -; SPARC64-NEXT: addxcc %g2, %l3, %g2 -; SPARC64-NEXT: srl %g3, 0, %g3 -; SPARC64-NEXT: sllx %g2, 32, %g2 -; SPARC64-NEXT: or %g2, %g3, %g2 -; SPARC64-NEXT: sllx %i4, 32, %i4 -; SPARC64-NEXT: srax %i4, 63, %g3 -; SPARC64-NEXT: xor %g2, %g3, %g2 -; SPARC64-NEXT: srl %i5, 0, %i5 -; SPARC64-NEXT: sllx %i0, 32, %i0 -; SPARC64-NEXT: or %i0, %i5, %i0 -; SPARC64-NEXT: xor %i0, %g3, %i0 -; SPARC64-NEXT: or %i0, %g2, %i0 -; SPARC64-NEXT: movrnz %i0, 1, %i2 -; SPARC64-NEXT: srl %i3, 0, %i0 -; SPARC64-NEXT: or %i4, %i0, %i0 +; SPARC64-NEXT: mov %g0, %i3 +; SPARC64-NEXT: mov %g0, %i4 +; SPARC64-NEXT: add %o0, %i5, %i5 +; SPARC64-NEXT: add %o1, %l0, %g2 +; SPARC64-NEXT: cmp %g2, %o1 +; SPARC64-NEXT: movcs %xcc, 1, %i2 ; SPARC64-NEXT: srl %i2, 0, %i2 +; SPARC64-NEXT: add %i5, %i2, %i2 +; SPARC64-NEXT: add %l2, %i2, %i2 +; SPARC64-NEXT: add %l1, %g2, %i5 +; SPARC64-NEXT: cmp %i5, %l1 +; SPARC64-NEXT: movcs %xcc, 1, %i3 +; SPARC64-NEXT: srl %i3, 0, %i3 +; SPARC64-NEXT: add %i2, %i3, %i2 +; SPARC64-NEXT: srax %i0, 63, %i3 +; SPARC64-NEXT: xor %i2, %i3, %i2 +; SPARC64-NEXT: xor %i5, %i3, %i3 +; SPARC64-NEXT: or %i3, %i2, %i2 +; SPARC64-NEXT: movrnz %i2, 1, %i4 +; SPARC64-NEXT: srl %i4, 0, %i2 ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore start: - %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %l, i128 %r) #2 + %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %l, i128 %r) %1 = extractvalue { i128, i1 } %0, 0 %2 = extractvalue { i128, i1 } %0, 1 %3 = zext i1 %2 to i8 @@ -279,9 +267,4 @@ start: ret { i128, i8 } %5 } -; Function Attrs: nounwind readnone speculatable -declare { i128, i1 } @llvm.smul.with.overflow.i128(i128, i128) #1 - -attributes #0 = { nounwind readnone uwtable } -attributes #1 = { nounwind readnone speculatable } -attributes #2 = { nounwind } +declare { i128, i1 } @llvm.smul.with.overflow.i128(i128, i128) diff --git a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll index 9ca895fe78073d..01383a00c26193 100644 --- a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll @@ -2,14 +2,10 @@ ; RUN: llc < %s -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC ; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC64 -define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { +define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; 
SPARC-LABEL: muloti_test: -; SPARC: .cfi_startproc -; SPARC-NEXT: ! %bb.0: ! %start +; SPARC: ! %bb.0: ! %start ; SPARC-NEXT: save %sp, -96, %sp -; SPARC-NEXT: .cfi_def_cfa_register %fp -; SPARC-NEXT: .cfi_window_save -; SPARC-NEXT: .cfi_register %o7, %i7 ; SPARC-NEXT: mov %i3, %g2 ; SPARC-NEXT: mov %i2, %g4 ; SPARC-NEXT: umul %i2, %i5, %i2 @@ -160,14 +156,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: restore %g0, %g2, %o1 ; ; SPARC64-LABEL: muloti_test: -; SPARC64: .cfi_startproc -; SPARC64-NEXT: .register %g2, #scratch +; SPARC64: .register %g2, #scratch ; SPARC64-NEXT: .register %g3, #scratch ; SPARC64-NEXT: ! %bb.0: ! %start ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: .cfi_def_cfa_register %fp -; SPARC64-NEXT: .cfi_window_save -; SPARC64-NEXT: .cfi_register %o7, %i7 ; SPARC64-NEXT: mov %g0, %o0 ; SPARC64-NEXT: mov %i2, %o1 ; SPARC64-NEXT: mov %g0, %o2 @@ -208,7 +200,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore %g0, %o1, %o1 start: - %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 + %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) %1 = extractvalue { i128, i1 } %0, 0 %2 = extractvalue { i128, i1 } %0, 1 %3 = zext i1 %2 to i8 @@ -217,9 +209,4 @@ start: ret { i128, i8 } %5 } -; Function Attrs: nounwind readnone speculatable -declare { i128, i1 } @llvm.umul.with.overflow.i128(i128, i128) #1 - -attributes #0 = { nounwind readnone uwtable } -attributes #1 = { nounwind readnone speculatable } -attributes #2 = { nounwind } +declare { i128, i1 } @llvm.umul.with.overflow.i128(i128, i128) diff --git a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll index cfec52c0e68863..835cd9f509b0d7 100644 --- a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll +++ b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll @@ -177,6 +177,90 @@ define float @tan(float %x) #0 { ret float %result } +define float @acos(float %x) #0 { +; CHECK-LABEL: acos: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps (%esp) +; CHECK-NEXT: wait +; CHECK-NEXT: calll _acosf +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl + %result = call float @llvm.experimental.constrained.acos.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret float %result +} + +define float @asin(float %x) #0 { +; CHECK-LABEL: asin: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps (%esp) +; CHECK-NEXT: wait +; CHECK-NEXT: calll _asinf +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl + %result = call float @llvm.experimental.constrained.asin.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret float %result +} + +define float @atan(float %x) #0 { +; CHECK-LABEL: atan: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps (%esp) +; CHECK-NEXT: wait +; CHECK-NEXT: calll _atanf +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl + %result = call float @llvm.experimental.constrained.atan.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret float %result +} + +define float @cosh(float %x) #0 { +; CHECK-LABEL: cosh: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps (%esp) +; CHECK-NEXT: wait +; CHECK-NEXT: calll _coshf +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl + 
%result = call float @llvm.experimental.constrained.cosh.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret float %result +} + +define float @sinh(float %x) #0 { +; CHECK-LABEL: sinh: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps (%esp) +; CHECK-NEXT: wait +; CHECK-NEXT: calll _sinhf +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl + %result = call float @llvm.experimental.constrained.sinh.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret float %result +} + +define float @tanh(float %x) #0 { +; CHECK-LABEL: tanh: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps (%esp) +; CHECK-NEXT: wait +; CHECK-NEXT: calll _tanhf +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl + %result = call float @llvm.experimental.constrained.tanh.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret float %result +} + attributes #0 = { strictfp } declare float @llvm.experimental.constrained.ceil.f32(float, metadata) @@ -189,3 +273,9 @@ declare float @llvm.experimental.constrained.log10.f32(float, metadata, metadata declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata) declare float @llvm.experimental.constrained.sin.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.acos.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.asin.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.atan.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.cosh.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.sinh.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.tanh.f32(float, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/fsafdo_test1.ll b/llvm/test/CodeGen/X86/fsafdo_test1.ll index 61c0f59aba6f81..e80a7f2f354f2d 100644 --- a/llvm/test/CodeGen/X86/fsafdo_test1.ll +++ b/llvm/test/CodeGen/X86/fsafdo_test1.ll @@ -4,9 +4,9 @@ ; Check that fs-afdo discriminators are generated. ; V01: .loc 1 7 3 is_stmt 0 discriminator 2 # foo.c:7:3 ; V01: .loc 1 9 5 is_stmt 1 discriminator 2 # foo.c:9:5 -; V0: .loc 1 9 5 is_stmt 0 discriminator 11266 # foo.c:9:5 +; V0: .loc 1 9 5 discriminator 11266 # foo.c:9:5 ; V0: .loc 1 7 3 is_stmt 1 discriminator 11266 # foo.c:7:3 -; V1: .loc 1 9 5 is_stmt 0 discriminator 514 # foo.c:9:5 +; V1: .loc 1 9 5 discriminator 514 # foo.c:9:5 ; V1: .loc 1 7 3 is_stmt 1 discriminator 258 # foo.c:7:3 ; Check that variable __llvm_fs_discriminator__ is generated. ; V01: .type __llvm_fs_discriminator__,@object # @__llvm_fs_discriminator__ diff --git a/llvm/test/CodeGen/X86/fsafdo_test4.ll b/llvm/test/CodeGen/X86/fsafdo_test4.ll index 6a22ea98224123..effc72b44ade80 100644 --- a/llvm/test/CodeGen/X86/fsafdo_test4.ll +++ b/llvm/test/CodeGen/X86/fsafdo_test4.ll @@ -1,11 +1,16 @@ -; RUN: llc -enable-fs-discriminator -improved-fs-discriminator=false < %s | FileCheck %s -; RUN: llc -enable-fs-discriminator -improved-fs-discriminator=true < %s | FileCheck %s +; RUN: llc -enable-fs-discriminator -improved-fs-discriminator=false < %s | FileCheck --implicit-check-not=.loc %s +; RUN: llc -enable-fs-discriminator -improved-fs-discriminator=true < %s | FileCheck --implicit-check-not=.loc %s ; ; Check that fs-afdo discriminators are NOT generated, as debugInfoForProfiling is false (not set). 
+; CHECK: .loc 1 7 15 prologue_end discriminator 2 # foo.c:7:15
 ; CHECK: .loc 1 7 3 is_stmt 0 discriminator 2 # foo.c:7:3
+; CHECK: .loc 1 0 3 # foo.c:0:3
 ; CHECK: .loc 1 9 5 is_stmt 1 discriminator 2 # foo.c:9:5
-; CHECK-NOT: .loc 1 9 5 is_stmt 0 discriminator
-; CHECK-NOT: .loc 1 7 3 is_stmt 1 discriminator
+; CHECK: .loc 1 0 5 is_stmt 0 # :0:5
+; CHECK: .loc 1 9 5 discriminator 2 # foo.c:9:5
+; CHECK: .loc 1 0 5 # :0:5
+; CHECK: .loc 1 7 3 is_stmt 1 discriminator 2 # foo.c:7:3
+; CHECK: .loc 1 14 3 # foo.c:14:3
 
 ; Check that variable __llvm_fs_discriminator__ is NOT generated.
 ; CHECK-NOT: __llvm_fs_discriminator__:
diff --git a/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll b/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll
index 52d0c2b509128b..629f44b52bc059 100644
--- a/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll
+++ b/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll
@@ -17,4 +17,22 @@ entry:
   ret i32 %b
 }
 
+define i32 @foo2(i32 %arg1) #1 {
+; CHECK-LABEL: foo2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: andl $31, %edi
+; CHECK-NEXT: movzwl -72(%rsp,%rdi,2), %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %a = extractelement <32 x i16> zeroinitializer, i32 %arg1
+  %b = zext i16 %a to i32
+  ret i32 %b
+}
+
 attributes #0 = { "no-realign-stack" "target-cpu"="skylake-avx512" }
+attributes #1 = { "no-realign-stack" "target-cpu"="skylake" }
diff --git a/llvm/test/CodeGen/X86/vec-libcalls.ll b/llvm/test/CodeGen/X86/vec-libcalls.ll
index 6857101d3d75bb..b107b1c2749ccb 100644
--- a/llvm/test/CodeGen/X86/vec-libcalls.ll
+++ b/llvm/test/CodeGen/X86/vec-libcalls.ll
@@ -25,6 +25,54 @@ declare <5 x float> @llvm.tan.v5f32(<5 x float>)
 declare <6 x float> @llvm.tan.v6f32(<6 x float>)
 declare <3 x double> @llvm.tan.v3f64(<3 x double>)
 
+declare <1 x float> @llvm.acos.v1f32(<1 x float>)
+declare <2 x float> @llvm.acos.v2f32(<2 x float>)
+declare <3 x float> @llvm.acos.v3f32(<3 x float>)
+declare <4 x float> @llvm.acos.v4f32(<4 x float>)
+declare <5 x float> @llvm.acos.v5f32(<5 x float>)
+declare <6 x float> @llvm.acos.v6f32(<6 x float>)
+declare <3 x double> @llvm.acos.v3f64(<3 x double>)
+
+declare <1 x float> @llvm.asin.v1f32(<1 x float>)
+declare <2 x float> @llvm.asin.v2f32(<2 x float>)
+declare <3 x float> @llvm.asin.v3f32(<3 x float>)
+declare <4 x float> @llvm.asin.v4f32(<4 x float>)
+declare <5 x float> @llvm.asin.v5f32(<5 x float>)
+declare <6 x float> @llvm.asin.v6f32(<6 x float>)
+declare <3 x double> @llvm.asin.v3f64(<3 x double>)
+
+declare <1 x float> @llvm.atan.v1f32(<1 x float>)
+declare <2 x float> @llvm.atan.v2f32(<2 x float>)
+declare <3 x float> @llvm.atan.v3f32(<3 x float>)
+declare <4 x float> @llvm.atan.v4f32(<4 x float>)
+declare <5 x float> @llvm.atan.v5f32(<5 x float>)
+declare <6 x float> @llvm.atan.v6f32(<6 x float>)
+declare <3 x double> @llvm.atan.v3f64(<3 x double>)
+
+declare <1 x float> @llvm.cosh.v1f32(<1 x float>)
+declare <2 x float> @llvm.cosh.v2f32(<2 x float>)
+declare <3 x float> @llvm.cosh.v3f32(<3 x float>)
+declare <4 x float> @llvm.cosh.v4f32(<4 x float>)
+declare <5 x float> @llvm.cosh.v5f32(<5 x float>)
+declare <6 x float> @llvm.cosh.v6f32(<6 x float>)
+declare <3 x double> @llvm.cosh.v3f64(<3 x double>)
+
+declare <1 x float> @llvm.sinh.v1f32(<1 x float>)
+declare <2 x 
float> @llvm.sinh.v2f32(<2 x float>) +declare <3 x float> @llvm.sinh.v3f32(<3 x float>) +declare <4 x float> @llvm.sinh.v4f32(<4 x float>) +declare <5 x float> @llvm.sinh.v5f32(<5 x float>) +declare <6 x float> @llvm.sinh.v6f32(<6 x float>) +declare <3 x double> @llvm.sinh.v3f64(<3 x double>) + +declare <1 x float> @llvm.tanh.v1f32(<1 x float>) +declare <2 x float> @llvm.tanh.v2f32(<2 x float>) +declare <3 x float> @llvm.tanh.v3f32(<3 x float>) +declare <4 x float> @llvm.tanh.v4f32(<4 x float>) +declare <5 x float> @llvm.tanh.v5f32(<5 x float>) +declare <6 x float> @llvm.tanh.v6f32(<6 x float>) +declare <3 x double> @llvm.tanh.v3f64(<3 x double>) + ; Verify that all of the potential libcall candidates are handled. ; Some of these have custom lowering, so those cases won't have ; libcalls. @@ -432,6 +480,1170 @@ define <3 x double> @tan_v3f64(<3 x double> %x) nounwind { ret <3 x double> %r } +define <1 x float> @acos_v1f32(<1 x float> %x) nounwind { +; CHECK-LABEL: acos_v1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %r = call <1 x float> @llvm.acos.v1f32(<1 x float> %x) + ret <1 x float> %r +} + +define <2 x float> @acos_v2f32(<2 x float> %x) nounwind { +; CHECK-LABEL: acos_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <2 x float> @llvm.acos.v2f32(<2 x float> %x) + ret <2 x float> %r +} + +define <3 x float> @acos_v3f32(<3 x float> %x) nounwind { +; CHECK-LABEL: acos_v3f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <3 x float> @llvm.acos.v3f32(<3 x float> %x) + ret <3 x float> %r +} + +define <4 x float> @acos_v4f32(<4 x float> %x) nounwind { +; CHECK-LABEL: acos_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <4 x float> @llvm.acos.v4f32(<4 x float> %x) + ret <4 x float> %r +} + +define <5 x float> @acos_v5f32(<5 x float> %x) nounwind { +; CHECK-LABEL: acos_v5f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <5 x float> @llvm.acos.v5f32(<5 x float> %x) + ret <5 x float> %r +} + +define <6 x float> @acos_v6f32(<6 x float> %x) nounwind { +; CHECK-LABEL: acos_v6f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; 
CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq acosf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <6 x float> @llvm.acos.v6f32(<6 x float> %x) + ret <6 x float> %r +} + +define <3 x double> @acos_v3f64(<3 x double> %x) nounwind { +; CHECK-LABEL: acos_v3f64: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq acos@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq acos@PLT +; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq acos@PLT +; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <3 x double> @llvm.acos.v3f64(<3 x double> %x) + ret <3 x double> %r +} + +define <1 x float> @asin_v1f32(<1 x float> %x) nounwind { +; CHECK-LABEL: asin_v1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %r = call <1 x float> @llvm.asin.v1f32(<1 x float> %x) + ret <1 x float> %r +} + +define <2 x float> @asin_v2f32(<2 x float> %x) nounwind { +; CHECK-LABEL: asin_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <2 x float> @llvm.asin.v2f32(<2 x float> %x) + ret <2 x float> %r +} + +define <3 x float> @asin_v3f32(<3 x float> %x) nounwind { +; CHECK-LABEL: asin_v3f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = 
mem[1,1,3,3] +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <3 x float> @llvm.asin.v3f32(<3 x float> %x) + ret <3 x float> %r +} + +define <4 x float> @asin_v4f32(<4 x float> %x) nounwind { +; CHECK-LABEL: asin_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <4 x float> @llvm.asin.v4f32(<4 x float> %x) + ret <4 x float> %r +} + +define <5 x float> @asin_v5f32(<5 x float> %x) nounwind { +; CHECK-LABEL: asin_v5f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; CHECK-NEXT: 
vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <5 x float> @llvm.asin.v5f32(<5 x float> %x) + ret <5 x float> %r +} + +define <6 x float> @asin_v6f32(<6 x float> %x) nounwind { +; CHECK-LABEL: asin_v6f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq asinf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <6 x float> @llvm.asin.v6f32(<6 x float> %x) + ret <6 x float> %r +} + +define <3 x double> @asin_v3f64(<3 x double> %x) nounwind { +; CHECK-LABEL: asin_v3f64: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq asin@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq asin@PLT +; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq asin@PLT +; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <3 x double> @llvm.asin.v3f64(<3 x double> %x) + ret <3 x double> %r +} + +define <1 x float> @atan_v1f32(<1 x float> %x) nounwind { +; CHECK-LABEL: atan_v1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: 
pushq %rax +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %r = call <1 x float> @llvm.atan.v1f32(<1 x float> %x) + ret <1 x float> %r +} + +define <2 x float> @atan_v2f32(<2 x float> %x) nounwind { +; CHECK-LABEL: atan_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <2 x float> @llvm.atan.v2f32(<2 x float> %x) + ret <2 x float> %r +} + +define <3 x float> @atan_v3f32(<3 x float> %x) nounwind { +; CHECK-LABEL: atan_v3f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <3 x float> @llvm.atan.v3f32(<3 x float> %x) + ret <3 x float> %r +} + +define <4 x float> @atan_v4f32(<4 x float> %x) nounwind { +; CHECK-LABEL: atan_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <4 x float> @llvm.atan.v4f32(<4 x float> %x) + ret <4 x float> %r +} + +define <5 x float> @atan_v5f32(<5 x float> %x) nounwind { +; CHECK-LABEL: atan_v5f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq atanf@PLT +; 
CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <5 x float> @llvm.atan.v5f32(<5 x float> %x) + ret <5 x float> %r +} + +define <6 x float> @atan_v6f32(<6 x float> %x) nounwind { +; CHECK-LABEL: atan_v6f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq atanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <6 x float> @llvm.atan.v6f32(<6 x 
float> %x) + ret <6 x float> %r +} + +define <3 x double> @atan_v3f64(<3 x double> %x) nounwind { +; CHECK-LABEL: atan_v3f64: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq atan@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq atan@PLT +; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq atan@PLT +; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <3 x double> @llvm.atan.v3f64(<3 x double> %x) + ret <3 x double> %r +} + +define <1 x float> @cosh_v1f32(<1 x float> %x) nounwind { +; CHECK-LABEL: cosh_v1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %r = call <1 x float> @llvm.cosh.v1f32(<1 x float> %x) + ret <1 x float> %r +} + +define <2 x float> @cosh_v2f32(<2 x float> %x) nounwind { +; CHECK-LABEL: cosh_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <2 x float> @llvm.cosh.v2f32(<2 x float> %x) + ret <2 x float> %r +} + +define <3 x float> @cosh_v3f32(<3 x float> %x) nounwind { +; CHECK-LABEL: cosh_v3f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <3 x float> @llvm.cosh.v3f32(<3 x float> %x) + ret <3 x float> %r +} + +define <4 x float> @cosh_v4f32(<4 x float> %x) nounwind { +; CHECK-LABEL: cosh_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; 
CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <4 x float> @llvm.cosh.v4f32(<4 x float> %x) + ret <4 x float> %r +} + +define <5 x float> @cosh_v5f32(<5 x float> %x) nounwind { +; CHECK-LABEL: cosh_v5f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <5 x float> @llvm.cosh.v5f32(<5 x float> %x) + ret <5 x float> %r +} + +define <6 x float> @cosh_v6f32(<6 x float> %x) nounwind { +; CHECK-LABEL: cosh_v6f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: 
def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq coshf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <6 x float> @llvm.cosh.v6f32(<6 x float> %x) + ret <6 x float> %r +} + +define <3 x double> @cosh_v3f64(<3 x double> %x) nounwind { +; CHECK-LABEL: cosh_v3f64: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq cosh@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq cosh@PLT +; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq cosh@PLT +; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <3 x double> @llvm.cosh.v3f64(<3 x double> %x) + ret <3 x double> %r +} + +define <1 x float> @sinh_v1f32(<1 x float> %x) nounwind { +; CHECK-LABEL: sinh_v1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %r = call <1 x float> @llvm.sinh.v1f32(<1 x float> %x) + ret <1 x float> %r +} + +define <2 x float> @sinh_v2f32(<2 x float> %x) nounwind { +; CHECK-LABEL: sinh_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <2 x float> @llvm.sinh.v2f32(<2 x float> %x) + ret <2 x float> %r +} + +define <3 x float> @sinh_v3f32(<3 x float> %x) nounwind { +; CHECK-LABEL: sinh_v3f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <3 x float> @llvm.sinh.v3f32(<3 x float> %x) + ret <3 x float> %r +} + +define <4 x float> @sinh_v4f32(<4 x float> %x) nounwind { +; CHECK-LABEL: sinh_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <4 x float> @llvm.sinh.v4f32(<4 x float> %x) + ret <4 x float> %r +} + +define <5 x float> @sinh_v5f32(<5 x float> %x) nounwind { +; CHECK-LABEL: sinh_v5f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <5 x float> @llvm.sinh.v5f32(<5 x float> %x) + ret <5 x float> %r +} + +define <6 x float> @sinh_v6f32(<6 x float> %x) nounwind { +; CHECK-LABEL: sinh_v6f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq sinhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <6 x float> @llvm.sinh.v6f32(<6 x float> %x) + ret <6 x float> %r +} + +define <3 x double> @sinh_v3f64(<3 x double> %x) nounwind { +; CHECK-LABEL: sinh_v3f64: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq sinh@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq sinh@PLT +; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq sinh@PLT +; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: addq $72, %rsp +; 
CHECK-NEXT: retq + %r = call <3 x double> @llvm.sinh.v3f64(<3 x double> %x) + ret <3 x double> %r +} + +define <1 x float> @tanh_v1f32(<1 x float> %x) nounwind { +; CHECK-LABEL: tanh_v1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %r = call <1 x float> @llvm.tanh.v1f32(<1 x float> %x) + ret <1 x float> %r +} + +define <2 x float> @tanh_v2f32(<2 x float> %x) nounwind { +; CHECK-LABEL: tanh_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <2 x float> @llvm.tanh.v2f32(<2 x float> %x) + ret <2 x float> %r +} + +define <3 x float> @tanh_v3f32(<3 x float> %x) nounwind { +; CHECK-LABEL: tanh_v3f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <3 x float> @llvm.tanh.v3f32(<3 x float> %x) + ret <3 x float> %r +} + +define <4 x float> @tanh_v4f32(<4 x float> %x) nounwind { +; CHECK-LABEL: tanh_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <4 x float> @llvm.tanh.v4f32(<4 x float> %x) + ret <4 x float> %r +} + +define <5 x float> @tanh_v5f32(<5 x float> %x) nounwind { +; CHECK-LABEL: tanh_v5f32: +; CHECK: # %bb.0: +; 
CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <5 x float> @llvm.tanh.v5f32(<5 x float> %x) + ret <5 x float> %r +} + +define <6 x float> @tanh_v6f32(<6 x float> %x) nounwind { +; CHECK-LABEL: tanh_v6f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq tanhf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} 
xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <6 x float> @llvm.tanh.v6f32(<6 x float> %x) + ret <6 x float> %r +} + +define <3 x double> @tanh_v3f64(<3 x double> %x) nounwind { +; CHECK-LABEL: tanh_v3f64: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq tanh@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq tanh@PLT +; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq tanh@PLT +; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <3 x double> @llvm.tanh.v3f64(<3 x double> %x) + ret <3 x double> %r +} + define <2 x float> @fabs_v2f32(<2 x float> %x) nounwind { ; CHECK-LABEL: fabs_v2f32: ; CHECK: # %bb.0: diff --git a/llvm/test/DebugInfo/X86/loop-align-debug.ll b/llvm/test/DebugInfo/X86/loop-align-debug.ll new file mode 100644 index 00000000000000..a0302d08faa0c3 --- /dev/null +++ b/llvm/test/DebugInfo/X86/loop-align-debug.ll @@ -0,0 +1,55 @@ +; RUN: llc %s --filetype=obj -o %t +; RUN: llvm-objdump -d %t | FileCheck %s --check-prefixes=OBJ +; RUN: llvm-dwarfdump --debug-line %t | FileCheck %s --check-prefixes=DBG +; RUN: llc %s -o - | FileCheck %s --check-prefixes=ASM + +; OBJ: 1:{{.*}}nop + +;; Address Line Column File ISA Discriminator OpIndex Flags +; DBG: 0x0000000000000000 3 0 0 0 0 0 is_stmt +; DBG: 0x0000000000000001 0 0 0 0 0 0 +; DBG: 0x0000000000000010 5 0 0 0 0 0 is_stmt prologue_end +; DBG: 0x0000000000000017 5 0 0 0 0 0 is_stmt end_sequence + +; ASM: .loc 0 0 0 is_stmt 0 +; ASM-NEXT: .L{{.*}}: +; ASM-NEXT: .p2align 4, 0x90 + +;; $ cat test.cpp +;; void g(); +;; void f() { +;; [[clang::code_align(16)]] +;; while (1) { g(); } +;; } + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local void @f() local_unnamed_addr !dbg !9 { +entry: + br label %while.body, !dbg !12 + +while.body: ; preds = %entry, %while.body + tail call void @g(), !dbg !12 + br label %while.body, !dbg !12, !llvm.loop !13 +} + +declare !dbg !16 void @g() local_unnamed_addr + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 19.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.cpp", directory: "/") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!8 = !{!"clang version 19.0.0git"} +!9 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 3, type: !10, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = 
!DISubroutineType(types: !11) +!11 = !{} +!12 = !DILocation(line: 5, scope: !9) +!13 = distinct !{!13, !12, !12, !14, !15} +!14 = !{!"llvm.loop.mustprogress"} +!15 = !{!"llvm.loop.align", i32 16} +!16 = !DISubprogram(name: "g", scope: !1, file: !1, line: 2, type: !10, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) diff --git a/llvm/test/Transforms/GlobalOpt/x86_mmx_load.ll b/llvm/test/Transforms/GlobalOpt/x86_mmx_load.ll deleted file mode 100644 index e352900e2a458e..00000000000000 --- a/llvm/test/Transforms/GlobalOpt/x86_mmx_load.ll +++ /dev/null @@ -1,12 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=globalopt < %s | FileCheck %s - -@m64 = internal global <1 x i64> zeroinitializer - -define i32 @load_mmx() { -; CHECK-LABEL: @load_mmx( -; CHECK-NEXT: ret i32 0 -; - %temp = load x86_mmx, ptr @m64 - ret i32 0 -} diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll index 1627292732b6a8..38039217998584 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll @@ -1,4 +1,5 @@ ; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s +; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s --check-prefix=CHECK-AVX-VF2 ; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -mattr=+avx512f -S < %s | FileCheck %s --check-prefix=CHECK-AVX512-VF8 ; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -mattr=+avx512f -S < %s | FileCheck %s --check-prefix=CHECK-AVX512-VF16 @@ -20,6 +21,27 @@ declare float @tanf(float) #0 declare double @llvm.tan.f64(double) #0 declare float @llvm.tan.f32(float) #0 +declare float @acosf(float) #0 +declare float @llvm.acos.f32(float) #0 + +declare double @asin(double) #0 +declare float @asinf(float) #0 +declare double @llvm.asin.f64(double) #0 +declare float @llvm.asin.f32(float) #0 + +declare double @atan(double) #0 +declare float @atanf(float) #0 +declare double @llvm.atan.f64(double) #0 +declare float @llvm.atan.f32(float) #0 + +declare double @cosh(double) #0 +declare float @coshf(float) #0 +declare double @llvm.cosh.f64(double) #0 +declare float @llvm.cosh.f32(float) #0 + +declare float @tanhf(float) #0 +declare float @llvm.tanh.f32(float) #0 + declare double @pow(double, double) #0 declare float @powf(float, float) #0 declare double @llvm.pow.f64(double, double) #0 @@ -274,6 +296,10 @@ define void @tan_f64(ptr nocapture %varray) { ; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_tan(<4 x double> [[TMP4:%.*]]) ; CHECK: ret void ; +; CHECK-AVX-VF2-LABEL: @tan_f64( +; CHECK-AVX-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_tan(<2 x double> [[TMP4:%.*]]) +; CHECK-AVX-VF2: ret void +; ; CHECK-AVX512-VF8-LABEL: @tan_f64( ; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_tan(<8 x double> [[TMP4:%.*]]) ; CHECK-AVX512-VF8: ret void @@ -328,6 +354,10 @@ define void @tan_f64_intrinsic(ptr nocapture %varray) { ; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_tan(<4 x double> [[TMP4:%.*]]) ; CHECK: ret void ; +; CHECK-AVX-VF2-LABEL: @tan_f64_intrinsic( +; CHECK-AVX-VF2: [[TMP5:%.*]] = call <2 x double> 
@amd_vrd2_tan(<2 x double> [[TMP4:%.*]]) +; CHECK-AVX-VF2: ret void +; ; CHECK-AVX512-VF8-LABEL: @tan_f64_intrinsic( ; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_tan(<8 x double> [[TMP4:%.*]]) ; CHECK-AVX512-VF8: ret void @@ -377,6 +407,360 @@ for.end: ret void } +define void @acos_f32(ptr nocapture %varray) { +; CHECK-LABEL: @acos_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_acosf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @acosf(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @acos_f32_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @acos_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_acosf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.acos.f32(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @asin_f64(ptr nocapture %varray) { +; CHECK-AVX512-VF8-LABEL: @asin_f64( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_asin(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @asin(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @asin_f32(ptr nocapture %varray) { +; CHECK-LABEL: @asin_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_asinf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @asin_f32( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_asinf(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @asinf(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @asin_f64_intrinsic(ptr nocapture %varray) { +; CHECK-AVX512-VF8-LABEL: @asin_f64_intrinsic( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_asin(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call 
double @llvm.asin.f64(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @asin_f32_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @asin_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_asinf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @asin_f32_intrinsic( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_asinf(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.asin.f32(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @atan_f64(ptr nocapture %varray) { +; CHECK-LABEL: @atan_f64( +; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_atan(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX-VF2-LABEL: @atan_f64( +; CHECK-AVX-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_atan(<2 x double> [[TMP4:%.*]]) +; CHECK-AVX-VF2: ret void +; +; CHECK-AVX512-VF8-LABEL: @atan_f64( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_atan(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @atan(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @atan_f32(ptr nocapture %varray) { +; CHECK-LABEL: @atan_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_atanf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @atan_f32( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_atanf(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @atanf(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @atan_f64_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @atan_f64_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_atan(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX-VF2-LABEL: @atan_f64_intrinsic( +; CHECK-AVX-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_atan(<2 x double> [[TMP4:%.*]]) +; CHECK-AVX-VF2: ret void +; +; CHECK-AVX512-VF8-LABEL: @atan_f64_intrinsic( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_atan(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + 
+for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.atan.f64(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @atan_f32_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @atan_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_atanf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @atan_f32_intrinsic( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_atanf(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.atan.f32(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cosh_f32(ptr nocapture %varray) { +; CHECK-LABEL: @cosh_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_coshf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @coshf(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cosh_f32_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @cosh_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_coshf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.cosh.f32(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @tanh_f32(ptr nocapture %varray) { +; CHECK-LABEL: @tanh_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanhf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @tanhf(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @tanh_f32_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @tanh_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanhf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + 
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.tanh.f32(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) { ; CHECK-LABEL: @pow_f64( ; CHECK: [[TMP8:%.*]] = call <4 x double> @amd_vrd4_pow(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) diff --git a/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll index 27038f3a24b664..b3a5836c6bbde3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll @@ -456,6 +456,31 @@ for.end: ; preds = %for.body, %entry ret void } +;CHECK-LABEL: @asin_f32_intrinsic( +;CHECK: vasinf{{.*}}<4 x float> +;CHECK: ret void +declare float @llvm.asin.f32(float) nounwind readnone +define void @asin_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call float @llvm.asin.f32(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv + store float %call, ptr %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + ;CHECK-LABEL: @acos_f32( ;CHECK: vacosf{{.*}}<4 x float> ;CHECK: ret void @@ -481,6 +506,31 @@ for.end: ; preds = %for.body, %entry ret void } +;CHECK-LABEL: @acos_f32_intrinsic( +;CHECK: vacosf{{.*}}<4 x float> +;CHECK: ret void +declare float @llvm.acos.f32(float) nounwind readnone +define void @acos_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call float @llvm.acos.f32(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv + store float %call, ptr %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + ;CHECK-LABEL: @atan_f32( ;CHECK: vatanf{{.*}}<4 x float> ;CHECK: ret void @@ -506,6 +556,31 @@ for.end: ; preds = %for.body, %entry ret void } +;CHECK-LABEL: @atan_f32_intrinsic( +;CHECK: vatanf{{.*}}<4 x float> +;CHECK: ret void +declare float @llvm.atan.f32(float) nounwind readnone +define void @atan_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + 
+for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call float @llvm.atan.f32(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv + store float %call, ptr %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + ;CHECK-LABEL: @sinh_f32( ;CHECK: vsinhf{{.*}}<4 x float> ;CHECK: ret void @@ -531,6 +606,31 @@ for.end: ; preds = %for.body, %entry ret void } +;CHECK-LABEL: @sinh_f32_intrinsic( +;CHECK: vsinhf{{.*}}<4 x float> +;CHECK: ret void +declare float @llvm.sinh.f32(float) nounwind readnone +define void @sinh_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call float @llvm.sinh.f32(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv + store float %call, ptr %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + ;CHECK-LABEL: @cosh_f32( ;CHECK: vcoshf{{.*}}<4 x float> ;CHECK: ret void @@ -556,6 +656,31 @@ for.end: ; preds = %for.body, %entry ret void } +;CHECK-LABEL: @cosh_f32_intrinsic( +;CHECK: vcoshf{{.*}}<4 x float> +;CHECK: ret void +declare float @llvm.cosh.f32(float) nounwind readnone +define void @cosh_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call float @llvm.cosh.f32(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv + store float %call, ptr %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + ;CHECK-LABEL: @tanh_f32( ;CHECK: vtanhf{{.*}}<4 x float> ;CHECK: ret void @@ -581,6 +706,31 @@ for.end: ; preds = %for.body, %entry ret void } +;CHECK-LABEL: @tanh_f32_intrinsic( +;CHECK: vtanhf{{.*}}<4 x float> +;CHECK: ret void +declare float @llvm.tanh.f32(float) nounwind readnone +define void @tanh_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + 
%call = tail call float @llvm.tanh.f32(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv + store float %call, ptr %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + ;CHECK-LABEL: @asinh_f32( ;CHECK: vasinhf{{.*}}<4 x float> ;CHECK: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll new file mode 100644 index 00000000000000..26be5556bb90d0 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll @@ -0,0 +1,366 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s + +; FIXME: Should not vectorize on gfx8 + +; GCN-LABEL: @fadd_combine_v2f16 +; GCN: fadd <2 x half> +define void @fadd_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = fadd half %tmp3, 1.000000e+00 + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = fadd half %tmp7, 1.000000e+00 + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; FIXME: Should not vectorize on gfx8 +; GCN-LABEL: @fsub_combine_v2f16 +; GCN: fsub <2 x half> +define void @fsub_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = fsub half %tmp3, 1.000000e+00 + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = fsub half %tmp7, 1.000000e+00 + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; FIXME: Should not vectorize on gfx8 +; GCN-LABEL: @fmul_combine_v2f16 +; GCN: fmul <2 x half> +define void @fmul_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = fmul half %tmp3, 1.000000e+00 + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = fmul half %tmp7, 1.000000e+00 + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; GCN-LABEL: @fdiv_combine_v2f16 +; GCN: fdiv <2 x half> +define void @fdiv_combine_v2f16(ptr 
addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = fdiv half %tmp3, 1.000000e+00 + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = fdiv half %tmp7, 1.000000e+00 + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; GCN-LABEL: @frem_combine_v2f16 +; GCN: frem <2 x half> +define void @frem_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = frem half %tmp3, 1.000000e+00 + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = frem half %tmp7, 1.000000e+00 + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; FIXME: Should not vectorize on gfx8 +; GCN-LABEL: @fma_combine_v2f16 +; GCN: call <2 x half> @llvm.fma.v2f16 +define amdgpu_kernel void @fma_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = tail call half @llvm.fma.f16(half %tmp3, half 1.000000e+00, half 1.000000e+00) + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = tail call half @llvm.fma.f16(half %tmp7, half 1.000000e+00, half 1.000000e+00) + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; FIXME: Should not vectorize on gfx8 +; GCN-LABEL: @fmuladd_combine_v2f16 +; GCN: call <2 x half> @llvm.fmuladd.v2f16 +define amdgpu_kernel void @fmuladd_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = tail call half @llvm.fmuladd.f16(half %tmp3, half 1.000000e+00, half 1.000000e+00) + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = tail call half @llvm.fmuladd.f16(half %tmp7, half 1.000000e+00, half 1.000000e+00) + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; GCN-LABEL: @minnum_combine_v2f16 +; GFX8: call half @llvm.minnum.f16( +; GFX8: call half @llvm.minnum.f16( + +; GFX9: call <2 x half> @llvm.minnum.v2f16 +define void @minnum_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = call half @llvm.minnum.f16(half %tmp3, half 1.000000e+00) + store half %tmp4, ptr addrspace(1) %tmp2, 
align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = call half @llvm.minnum.f16(half %tmp7, half 1.000000e+00) + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; GCN-LABEL: @maxnum_combine_v2f16 +; GFX8: call half @llvm.maxnum.f16( +; GFX8: call half @llvm.maxnum.f16( + +; GFX9: call <2 x half> @llvm.maxnum.v2f16 +define void @maxnum_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = call half @llvm.maxnum.f16(half %tmp3, half 1.000000e+00) + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = call half @llvm.maxnum.f16(half %tmp7, half 1.000000e+00) + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; FIXME: Should vectorize +; GCN-LABEL: @minimum_combine_v2f16 +; GCN: call half @llvm.minimum.f16( +; GCN: call half @llvm.minimum.f16( +define void @minimum_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = call half @llvm.minimum.f16(half %tmp3, half 1.000000e+00) + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = call half @llvm.minimum.f16(half %tmp7, half 1.000000e+00) + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; GCN-LABEL: @maximum_combine_v2f16 +; GCN: call half @llvm.maximum.f16( +; GCN: call half @llvm.maximum.f16( +define void @maximum_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = call half @llvm.maximum.f16(half %tmp3, half 1.000000e+00) + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = call half @llvm.maximum.f16(half %tmp7, half 1.000000e+00) + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; GCN-LABEL: @canonicalize_combine_v2f16 +; GFX8: call half @llvm.canonicalize.f16( +; GFX8: call half @llvm.canonicalize.f16( + +; GFX9: call <2 x half> @llvm.canonicalize.v2f16( +define void @canonicalize_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = call half @llvm.canonicalize.f16(half %tmp3) + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = call half 
@llvm.canonicalize.f16(half %tmp7) + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; GCN-LABEL: @fabs_combine_v2f16 +; GCN: call <2 x half> @llvm.fabs.v2f16( +define void @fabs_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = call half @llvm.fabs.f16(half %tmp3) + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = call half @llvm.fabs.f16(half %tmp7) + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; GCN-LABEL: @fneg_combine_v2f16 +; GCN: fneg <2 x half> +define void @fneg_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = fneg half %tmp3 + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = fneg half %tmp7 + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; FIXME: Should vectorize +; GCN-LABEL: @copysign_combine_v2f16 +; GCN: call half @llvm.copysign.f16( +; GCN: call half @llvm.copysign.f16( +define void @copysign_combine_v2f16(ptr addrspace(1) %arg, half %sign) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = call half @llvm.copysign.f16(half %tmp3, half %sign) + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = call half @llvm.copysign.f16(half %tmp7, half %sign) + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; FIXME: Should vectorize +; GCN-LABEL: @copysign_combine_v4f16 +; GCN: call half @llvm.copysign.f16( +; GCN: call half @llvm.copysign.f16( +; GCN: call half @llvm.copysign.f16( +; GCN: call half @llvm.copysign.f16( +define void @copysign_combine_v4f16(ptr addrspace(1) %arg, half %sign) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = call half @llvm.copysign.f16(half %tmp3, half %sign) + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = call half @llvm.copysign.f16(half %tmp7, half %sign) + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + + %tmp9 = add nuw nsw i64 %tmp1, 2 + %tmp10 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp9 + %tmp11 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp12 = call half @llvm.copysign.f16(half %tmp11, half %sign) + store half %tmp12, ptr addrspace(1) %tmp10, align 2 + + %tmp13 = add nuw nsw i64 %tmp1, 3 + %tmp14 = 
getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp13 + %tmp15 = load half, ptr addrspace(1) %tmp14, align 2 + %tmp16 = call half @llvm.copysign.f16(half %tmp15, half %sign) + store half %tmp16, ptr addrspace(1) %tmp14, align 2 + ret void +} + +; GCN-LABEL: @canonicalize_combine_v4f16 +; GFX8: call half @llvm.canonicalize.f16( +; GFX8: call half @llvm.canonicalize.f16( + +; GFX9: call <2 x half> @llvm.canonicalize.v2f16( +; GFX9: call <2 x half> @llvm.canonicalize.v2f16( +define void @canonicalize_combine_v4f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = call half @llvm.canonicalize.f16(half %tmp3) + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = call half @llvm.canonicalize.f16(half %tmp7) + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + + %tmp9 = add nuw nsw i64 %tmp1, 2 + %tmp10 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp9 + %tmp11 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp12 = call half @llvm.canonicalize.f16(half %tmp11) + store half %tmp12, ptr addrspace(1) %tmp10, align 2 + + %tmp13 = add nuw nsw i64 %tmp1, 3 + %tmp14 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp13 + %tmp15 = load half, ptr addrspace(1) %tmp14, align 2 + %tmp16 = call half @llvm.canonicalize.f16(half %tmp15) + store half %tmp16, ptr addrspace(1) %tmp14, align 2 + ret void +} diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp index eac72ee67867c7..0de89a55da513d 100644 --- a/llvm/unittests/IR/PatternMatch.cpp +++ b/llvm/unittests/IR/PatternMatch.cpp @@ -2317,6 +2317,14 @@ TYPED_TEST(MutableConstTest, ICmp) { EXPECT_FALSE(m_SpecificCmp(ICmpInst::getInversePredicate(Pred), m_Value(MatchL), m_Value(MatchR)) .match((InstructionType)IRB.CreateICmp(Pred, L, R))); + + EXPECT_TRUE(m_c_SpecificICmp(Pred, m_Specific(L), m_Specific(R)) + .match((InstructionType)IRB.CreateICmp(Pred, L, R))); + EXPECT_TRUE(m_c_SpecificICmp(ICmpInst::getSwappedPredicate(Pred), + m_Specific(R), m_Specific(L)) + .match((InstructionType)IRB.CreateICmp(Pred, L, R))); + EXPECT_FALSE(m_c_SpecificICmp(Pred, m_Specific(R), m_Specific(L)) + .match((InstructionType)IRB.CreateICmp(Pred, L, R))); } TYPED_TEST(MutableConstTest, FCmp) {
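The assertions above pin down the semantics of the new commutative matcher: it accepts the original predicate with the original operand order, or the swapped predicate with swapped operands, but not the original predicate with swapped operands. A minimal usage sketch under that reading follows; `V`, `A`, and `B` are hypothetical values, not names from this patch.

    // Hedged sketch of the intended call pattern; assumes only the semantics
    // demonstrated by the unit test above.
    using namespace llvm::PatternMatch;
    bool isCmpOfAB(llvm::Value *V, llvm::Value *A, llvm::Value *B) {
      // For a symmetric predicate such as eq, this matches `icmp eq A, B`
      // regardless of which operand the IR happens to put first.
      return match(V, m_c_SpecificICmp(llvm::ICmpInst::ICMP_EQ,
                                       m_Specific(A), m_Specific(B)));
    }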
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 7edfe457c32a9c..122508d15194f4 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -764,11 +764,23 @@ define void @foo(ptr %arg0, ptr %arg1) { // Check create(InsertBefore) sandboxir::LoadInst *NewLd = sandboxir::LoadInst::create(Ld->getType(), Arg1, Align(8), - /*InsertBefore=*/Ret, Ctx, "NewLd"); + /*InsertBefore=*/Ret, Ctx, + /*IsVolatile=*/false, "NewLd"); + // Check that create() propagated IsVolatile=false. + EXPECT_FALSE(NewLd->isVolatile()); EXPECT_EQ(NewLd->getType(), Ld->getType()); EXPECT_EQ(NewLd->getPointerOperand(), Arg1); EXPECT_EQ(NewLd->getAlign(), 8); EXPECT_EQ(NewLd->getName(), "NewLd"); + + sandboxir::LoadInst *NewVLd = + sandboxir::LoadInst::create(Vld->getType(), Arg1, Align(8), + /*InsertBefore=*/Ret, Ctx, + /*IsVolatile=*/true, "NewVLd"); + + // Check that create() propagated IsVolatile=true. + EXPECT_TRUE(NewVLd->isVolatile()); + EXPECT_EQ(NewVLd->getName(), "NewVLd"); } TEST_F(SandboxIRTest, StoreInst) {
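For reference, a minimal sketch of how a caller would request a volatile load through the extended create() overload. It mirrors the calls in the test above; `Ty`, `Ptr`, `Ret`, and `Ctx` are placeholders for values a real caller would already have.

    // Hedged usage sketch, assuming only the signature exercised by the test:
    // create(Type, PointerOp, Align, InsertBefore, Context, IsVolatile, Name).
    sandboxir::LoadInst *VLoad =
        sandboxir::LoadInst::create(Ty, Ptr, Align(8), /*InsertBefore=*/Ret,
                                    Ctx, /*IsVolatile=*/true, "VLoad");
    assert(VLoad->isVolatile() && "create() should propagate the volatile flag");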
numTeamsLowerVar, numTeamsUpperVar; + Value numTeamsLower, numTeamsUpper; }; struct NumThreadsClauseOps { - Value numThreadsVar; + Value numThreads; }; struct OrderClauseOps { - ClauseOrderKindAttr orderAttr; - OrderModifierAttr orderModAttr; + ClauseOrderKindAttr order; + OrderModifierAttr orderMod; }; struct OrderedClauseOps { - IntegerAttr orderedAttr; + IntegerAttr ordered; }; struct ParallelizationLevelClauseOps { - UnitAttr parLevelSimdAttr; + UnitAttr parLevelSimd; }; struct PriorityClauseOps { - Value priorityVar; + Value priority; }; struct PrivateClauseOps { @@ -179,46 +180,46 @@ llvm::SmallVector<Value> privateVars; // The list of symbols referring to delayed privatizer ops (i.e. `omp.private` // ops). - llvm::SmallVector<Attribute> privatizers; + llvm::SmallVector<Attribute> privateSyms; }; struct ProcBindClauseOps { - ClauseProcBindKindAttr procBindKindAttr; + ClauseProcBindKindAttr procBindKind; }; struct ReductionClauseOps { llvm::SmallVector<Value> reductionVars; - llvm::SmallVector<bool> reductionVarsByRef; - llvm::SmallVector<Attribute> reductionDeclSymbols; + llvm::SmallVector<bool> reductionByref; + llvm::SmallVector<Attribute> reductionSyms; }; struct SafelenClauseOps { - IntegerAttr safelenAttr; + IntegerAttr safelen; }; struct ScheduleClauseOps { - ClauseScheduleKindAttr scheduleValAttr; - ScheduleModifierAttr scheduleModAttr; - Value scheduleChunkVar; - UnitAttr scheduleSimdAttr; + ClauseScheduleKindAttr scheduleKind; + Value scheduleChunk; + ScheduleModifierAttr scheduleMod; + UnitAttr scheduleSimd; }; struct SimdlenClauseOps { - IntegerAttr simdlenAttr; + IntegerAttr simdlen; }; struct TaskReductionClauseOps { llvm::SmallVector<Value> taskReductionVars; - llvm::SmallVector<bool> taskReductionVarsByRef; - llvm::SmallVector<Attribute> taskReductionDeclSymbols; + llvm::SmallVector<bool> taskReductionByref; + llvm::SmallVector<Attribute> taskReductionSyms; }; struct ThreadLimitClauseOps { - Value threadLimitVar; + Value threadLimit; }; struct UntiedClauseOps { - UnitAttr untiedAttr; + UnitAttr untied; }; struct UseDeviceAddrClauseOps { @@ -241,82 +242,81 @@ template <typename... Mixins> struct Clauses : public Mixins... {}; } // namespace detail -using CancelClauseOps = +using CancelOperands = detail::Clauses; -using CancellationPointClauseOps = - detail::Clauses; +using CancellationPointOperands = detail::Clauses; -using CriticalClauseOps = detail::Clauses; +using CriticalDeclareOperands = + detail::Clauses; // TODO `indirect` clause. -using DeclareTargetClauseOps = detail::Clauses; +using DeclareTargetOperands = detail::Clauses; -using DistributeClauseOps = +using DistributeOperands = detail::Clauses; -using LoopNestClauseOps = detail::Clauses; +using LoopNestOperands = detail::Clauses; -using MaskedClauseOps = detail::Clauses; +using MaskedOperands = detail::Clauses; -using OrderedOpClauseOps = detail::Clauses; +using OrderedOperands = detail::Clauses; -using OrderedRegionClauseOps = detail::Clauses; +using OrderedRegionOperands = detail::Clauses; -using ParallelClauseOps = +using ParallelOperands = detail::Clauses; -using SectionsClauseOps = detail::Clauses; +using SectionsOperands = detail::Clauses; -// TODO `linear` clause. -using SimdClauseOps = - detail::Clauses; +using SimdOperands = + detail::Clauses; -using SingleClauseOps = detail::Clauses; +using SingleOperands = detail::Clauses; // TODO `defaultmap`, `uses_allocators` clauses. 
-using TargetClauseOps = +using TargetOperands = detail::Clauses; -using TargetDataClauseOps = +using TargetDataOperands = detail::Clauses; -using TargetEnterExitUpdateDataClauseOps = +using TargetEnterExitUpdateDataOperands = detail::Clauses; // TODO `affinity`, `detach` clauses. -using TaskClauseOps = +using TaskOperands = detail::Clauses; -using TaskgroupClauseOps = +using TaskgroupOperands = detail::Clauses; -using TaskloopClauseOps = +using TaskloopOperands = detail::Clauses; -using TaskwaitClauseOps = detail::Clauses; +using TaskwaitOperands = detail::Clauses; -using TeamsClauseOps = +using TeamsOperands = detail::Clauses; -using WsloopClauseOps = +using WsloopOperands = detail::Clauses; diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td index 5b201687b4ea34..e703c323edbc8a 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td @@ -20,6 +20,7 @@ #define OPENMP_CLAUSES include "mlir/Dialect/OpenMP/OpenMPOpBase.td" +include "mlir/IR/SymbolInterfaces.td" //===----------------------------------------------------------------------===// // V5.2: [5.11] `aligned` clause @@ -32,18 +33,18 @@ class OpenMP_AlignedClauseSkip< description, extraClassDeclaration> { let arguments = (ins Variadic:$aligned_vars, - OptionalAttr:$alignment_values + OptionalAttr:$alignments ); let assemblyFormat = [{ `aligned` `(` custom($aligned_vars, type($aligned_vars), - $alignment_values) `)` + $alignments) `)` }]; let description = [{ - The `alignment_values` attribute additionally specifies alignment of each - corresponding aligned operand. Note that `aligned_vars` and - `alignment_values` should contain the same number of elements. + The `alignments` attribute additionally specifies alignment of each + corresponding aligned operand. Note that `aligned_vars` and `alignments` + must contain the same number of elements. }]; } @@ -60,22 +61,22 @@ class OpenMP_AllocateClauseSkip< description, extraClassDeclaration> { let arguments = (ins Variadic:$allocate_vars, - Variadic:$allocators_vars + Variadic:$allocator_vars ); let extraClassDeclaration = [{ unsigned getNumAllocateVars() { return getAllocateVars().size(); } - unsigned getNumAllocatorsVars() { return getAllocatorsVars().size(); } + unsigned getNumAllocatorsVars() { return getAllocatorVars().size(); } }]; let assemblyFormat = [{ `allocate` `(` custom($allocate_vars, type($allocate_vars), - $allocators_vars, type($allocators_vars)) `)` + $allocator_vars, type($allocator_vars)) `)` }]; let description = [{ - The `allocators_vars` and `allocate_vars` parameters are a variadic list of + The `allocator_vars` and `allocate_vars` parameters are a variadic list of values that specify the memory allocator to be used to obtain storage for private values. }]; @@ -93,12 +94,12 @@ class OpenMP_CancelDirectiveNameClauseSkip< > : OpenMP_Clause { let arguments = (ins - CancellationConstructTypeAttr:$cancellation_construct_type_val + CancellationConstructTypeAttr:$cancel_directive ); let assemblyFormat = [{ `cancellation_construct_type` `(` - custom($cancellation_construct_type_val) `)` + custom($cancel_directive) `)` }]; // TODO: Add description. 
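To make the `aligned`/`allocate` renames above concrete, a minimal sketch of populating the renamed clause-operands structs from C++ follows; the helper, its arguments, and the constant alignment are illustrative only, not part of this patch:

```cpp
#include "mlir/Dialect/OpenMP/OpenMPClauseOperands.h"
#include "mlir/IR/Builders.h"

// Illustrative helper: fill the renamed clause-operands structs.
// `alignedPtr`, `toAllocate`, and `allocator` stand in for SSA values
// produced elsewhere.
static void fillClauseOps(mlir::OpBuilder &builder, mlir::Value alignedPtr,
                          mlir::Value toAllocate, mlir::Value allocator) {
  mlir::omp::AlignedClauseOps aligned;
  aligned.alignedVars.push_back(alignedPtr);
  // `alignments` (formerly `alignmentAttrs`) must stay in lockstep with
  // `alignedVars`: one alignment attribute per aligned variable.
  aligned.alignments.push_back(builder.getI64IntegerAttr(32));

  mlir::omp::AllocateClauseOps allocate;
  // Field order is now `allocateVars, allocatorVars`, matching the
  // `$allocate_vars`/`$allocator_vars` tablegen arguments.
  allocate.allocateVars.push_back(toAllocate);
  allocate.allocatorVars.push_back(allocator);
}
```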
@@ -106,36 +107,6 @@ class OpenMP_CancelDirectiveNameClauseSkip< def OpenMP_CancelDirectiveNameClause : OpenMP_CancelDirectiveNameClauseSkip<>; -//===----------------------------------------------------------------------===// -// V5.2: [4.4.3] `collapse` clause -//===----------------------------------------------------------------------===// - -class OpenMP_CollapseClauseSkip< - bit traits = false, bit arguments = false, bit assemblyFormat = false, - bit description = false, bit extraClassDeclaration = false - > : OpenMP_Clause { - let traits = [ - AllTypesMatch<["lowerBound", "upperBound", "step"]> - ]; - - let arguments = (ins - Variadic:$lowerBound, - Variadic:$upperBound, - Variadic:$step - ); - - let extraClassDeclaration = [{ - /// Returns the number of loops in the loop nest. - unsigned getNumLoops() { return getLowerBound().size(); } - }]; - - // Description and formatting integrated in the `omp.loop_nest` operation, - // which is the only one currently accepting this clause. -} - -def OpenMP_CollapseClause : OpenMP_CollapseClauseSkip<>; - //===----------------------------------------------------------------------===// // V5.2: [5.7.2] `copyprivate` clause //===----------------------------------------------------------------------===// @@ -147,13 +118,13 @@ class OpenMP_CopyprivateClauseSkip< description, extraClassDeclaration> { let arguments = (ins Variadic:$copyprivate_vars, - OptionalAttr:$copyprivate_funcs + OptionalAttr:$copyprivate_syms ); let assemblyFormat = [{ `copyprivate` `(` - custom($copyprivate_vars, type($copyprivate_vars), - $copyprivate_funcs) `)` + custom($copyprivate_vars, type($copyprivate_vars), + $copyprivate_syms) `)` }]; let description = [{ @@ -174,6 +145,10 @@ class OpenMP_CriticalNameClauseSkip< bit description = false, bit extraClassDeclaration = false > : OpenMP_Clause { + let traits = [ + Symbol + ]; + let arguments = (ins SymbolNameAttr:$sym_name ); @@ -197,18 +172,19 @@ class OpenMP_DependClauseSkip< > : OpenMP_Clause { let arguments = (ins - OptionalAttr:$depends, + OptionalAttr:$depend_kinds, Variadic:$depend_vars ); let assemblyFormat = [{ `depend` `(` - custom($depend_vars, type($depend_vars), $depends) `)` + custom($depend_vars, type($depend_vars), $depend_kinds) `)` }]; let description = [{ - The `depends` and `depend_vars` arguments are variadic lists of values that - specify the dependencies of this particular task in relation to other tasks. + The `depend_kinds` and `depend_vars` arguments are variadic lists of values + that specify the dependencies of this particular task in relation to other + tasks. }]; } @@ -250,19 +226,20 @@ class OpenMP_DistScheduleClauseSkip< description, extraClassDeclaration> { let arguments = (ins UnitAttr:$dist_schedule_static, - Optional:$chunk_size + Optional:$dist_schedule_chunk_size ); let assemblyFormat = [{ `dist_schedule_static` $dist_schedule_static - | `chunk_size` `(` $chunk_size `:` type($chunk_size) `)` + | `dist_schedule_chunk_size` `(` $dist_schedule_chunk_size `:` + type($dist_schedule_chunk_size) `)` }]; let description = [{ The `dist_schedule_static` attribute specifies the schedule for this loop, determining how the loop is distributed across the various teams. The - optional `chunk_size` associated with this determines further controls this - distribution. + optional `dist_schedule_chunk_size` associated with it further controls + this distribution. 
}]; } @@ -278,24 +255,25 @@ class OpenMP_DoacrossClauseSkip< > : OpenMP_Clause { let arguments = (ins - OptionalAttr:$depend_type_val, - ConfinedAttr, [IntMinValue<0>]>:$num_loops_val, - Variadic:$depend_vec_vars + OptionalAttr:$doacross_depend_type, + ConfinedAttr, [IntMinValue<0>]>:$doacross_num_loops, + Variadic:$doacross_depend_vars ); let assemblyFormat = [{ - ( `depend_type` `` $depend_type_val^ )? - ( `depend_vec` `(` $depend_vec_vars^ `:` type($depend_vec_vars) `)` )? + ( `depend_type` `` $doacross_depend_type^ )? + ( `depend_vec` `(` $doacross_depend_vars^ `:` type($doacross_depend_vars) + `)` )? }]; let description = [{ - The `depend_type_val` attribute refers to either the DEPEND(SOURCE) clause - or the DEPEND(SINK: vec) clause. + The `doacross_depend_type` attribute refers to either the DEPEND(SOURCE) + clause or the DEPEND(SINK: vec) clause. - The `num_loops_val` attribute specifies the number of loops in the doacross - nest. + The `doacross_num_loops` attribute specifies the number of loops in the + doacross nest. - The `depend_vec_vars` is a variadic list of operands that specifies the + The `doacross_depend_vars` is a variadic list of operands that specifies the index of the loop iterator in the doacross nest for the DEPEND(SOURCE) clause or the index of the element of "vec" for the DEPEND(SINK: vec) clause. It contains the operands in multiple "vec" when multiple @@ -343,11 +321,11 @@ class OpenMP_FinalClauseSkip< > : OpenMP_Clause { let arguments = (ins - Optional:$final_expr + Optional:$final ); let assemblyFormat = [{ - `final` `(` $final_expr `)` + `final` `(` $final `)` }]; let description = [{ @@ -371,11 +349,11 @@ class OpenMP_GrainsizeClauseSkip< > : OpenMP_Clause { let arguments = (ins - Optional:$grain_size + Optional:$grainsize ); let assemblyFormat = [{ - `grain_size` `(` $grain_size `:` type($grain_size) `)` + `grainsize` `(` $grainsize `:` type($grainsize) `)` }]; let description = [{ @@ -398,17 +376,18 @@ class OpenMP_HasDeviceAddrClauseSkip< > : OpenMP_Clause { let arguments = (ins - Variadic:$has_device_addr + Variadic:$has_device_addr_vars ); let assemblyFormat = [{ - `has_device_addr` `(` $has_device_addr `:` type($has_device_addr) `)` + `has_device_addr` `(` $has_device_addr_vars `:` type($has_device_addr_vars) + `)` }]; let description = [{ - The optional `has_device_addr` indicates that list items already have device - addresses, so they may be directly accessed from the target device. This - includes array sections. + The optional `has_device_addr_vars` indicates that list items already have + device addresses, so they may be directly accessed from the target device. + This includes array sections. 
}]; } @@ -424,11 +403,11 @@ class OpenMP_HintClauseSkip< > : OpenMP_Clause { let arguments = (ins - DefaultValuedOptionalAttr:$hint_val + DefaultValuedOptionalAttr:$hint ); let assemblyFormat = [{ - `hint` `(` custom($hint_val) `)` + `hint` `(` custom($hint) `)` }]; let description = [{ @@ -477,14 +456,14 @@ class OpenMP_InReductionClauseSkip< let arguments = (ins Variadic:$in_reduction_vars, - OptionalAttr:$in_reduction_vars_byref, - OptionalAttr:$in_reductions + OptionalAttr:$in_reduction_byref, + OptionalAttr:$in_reduction_syms ); let assemblyFormat = [{ `in_reduction` `(` custom($in_reduction_vars, type($in_reduction_vars), - $in_reduction_vars_byref, $in_reductions) `)` + $in_reduction_byref, $in_reduction_syms) `)` }]; let extraClassDeclaration = [{ @@ -510,15 +489,15 @@ class OpenMP_IsDevicePtrClauseSkip< > : OpenMP_Clause { let arguments = (ins - Variadic:$is_device_ptr + Variadic:$is_device_ptr_vars ); let assemblyFormat = [{ - `is_device_ptr` `(` $is_device_ptr `:` type($is_device_ptr) `)` + `is_device_ptr` `(` $is_device_ptr_vars `:` type($is_device_ptr_vars) `)` }]; let description = [{ - The optional `is_device_ptr` indicates list items are device pointers. + The optional `is_device_ptr_vars` indicates list items are device pointers. }]; } @@ -554,6 +533,38 @@ class OpenMP_LinearClauseSkip< def OpenMP_LinearClause : OpenMP_LinearClauseSkip<>; +//===----------------------------------------------------------------------===// +// Not in the spec: Clause-like structure to hold loop related information. +//===----------------------------------------------------------------------===// + +class OpenMP_LoopRelatedClauseSkip< + bit traits = false, bit arguments = false, bit assemblyFormat = false, + bit description = false, bit extraClassDeclaration = false + > : OpenMP_Clause { + let traits = [ + AllTypesMatch< + ["loop_lower_bounds", "loop_upper_bounds", "loop_steps"]> + ]; + + let arguments = (ins + Variadic:$loop_lower_bounds, + Variadic:$loop_upper_bounds, + Variadic:$loop_steps, + UnitAttr:$loop_inclusive + ); + + let extraClassDeclaration = [{ + /// Returns the number of loops in the loop nest. + unsigned getNumLoops() { return getLoopLowerBounds().size(); } + }]; + + // Description and formatting integrated in the `omp.loop_nest` operation, + // which is the only one currently accepting this clause. +} + +def OpenMP_LoopRelatedClause : OpenMP_LoopRelatedClauseSkip<>; + //===----------------------------------------------------------------------===// // V5.2: [5.8.3] `map` clause //===----------------------------------------------------------------------===// @@ -568,16 +579,16 @@ class OpenMP_MapClauseSkip< ]; let arguments = (ins - Variadic:$map_operands + Variadic:$map_vars ); let assemblyFormat = [{ - `map_entries` `(` custom($map_operands, type($map_operands)) `)` + `map_entries` `(` custom($map_vars, type($map_vars)) `)` }]; let description = [{ - The optional `map_operands` maps data from the current task's data - environment to the device data environment. + The optional `map_vars` maps data from the current task's data environment + to the device data environment. 
}]; } @@ -593,11 +604,11 @@ class OpenMP_MemoryOrderClauseSkip< > : OpenMP_Clause { let arguments = (ins - OptionalAttr:$memory_order_val + OptionalAttr:$memory_order ); let assemblyFormat = [{ - `memory_order` `(` custom($memory_order_val) `)` + `memory_order` `(` custom($memory_order) `)` }]; let description = [{ @@ -779,16 +790,16 @@ class OpenMP_NumThreadsClauseSkip< > : OpenMP_Clause { let arguments = (ins - Optional:$num_threads_var + Optional:$num_threads ); let assemblyFormat = [{ - `num_threads` `(` $num_threads_var `:` type($num_threads_var) `)` + `num_threads` `(` $num_threads `:` type($num_threads) `)` }]; let description = [{ - The optional `num_threads_var` parameter specifies the number of threads - which should be used to execute the parallel region. + The optional `num_threads` parameter specifies the number of threads which + should be used to execute the parallel region. }]; } @@ -804,12 +815,12 @@ class OpenMP_OrderClauseSkip< > : OpenMP_Clause { let arguments = (ins - OptionalAttr:$order_val, + OptionalAttr:$order, OptionalAttr:$order_mod ); let assemblyFormat = [{ - `order` `(` custom($order_val, $order_mod) `)` + `order` `(` custom($order, $order_mod) `)` }]; let description = [{ @@ -831,15 +842,15 @@ class OpenMP_OrderedClauseSkip< > : OpenMP_Clause { let arguments = (ins - ConfinedAttr, [IntMinValue<0>]>:$ordered_val + ConfinedAttr, [IntMinValue<0>]>:$ordered ); let assemblyFormat = [{ - `ordered` `(` $ordered_val `)` + `ordered` `(` $ordered `)` }]; let description = [{ - The optional `ordered_val` attribute specifies how many loops are associated + The optional `ordered` attribute specifies how many loops are associated with the worksharing-loop construct. The value of zero refers to the ordered clause specified without parameter. }]; @@ -857,17 +868,17 @@ class OpenMP_ParallelizationLevelClauseSkip< > : OpenMP_Clause { let arguments = (ins - UnitAttr:$simd + UnitAttr:$par_level_simd ); let assemblyFormat = [{ - `simd` $simd + `par_level_simd` $par_level_simd }]; let description = [{ - The `simd` attribute corresponds to the simd clause specified. If it is not - present, it behaves as if the threads clause is specified or no clause is - specified. + The `par_level_simd` attribute corresponds to the simd clause specified. If + it is not present, it behaves as if the threads clause is specified or no + clause is specified. }]; } @@ -914,12 +925,12 @@ class OpenMP_PrivateClauseSkip< description, extraClassDeclaration> { let arguments = (ins Variadic:$private_vars, - OptionalAttr:$privatizers + OptionalAttr:$private_syms ); let assemblyFormat = [{ `private` `(` - custom($private_vars, type($private_vars), $privatizers) `)` + custom($private_vars, type($private_vars), $private_syms) `)` }]; // TODO: Add description. @@ -937,15 +948,15 @@ class OpenMP_ProcBindClauseSkip< > : OpenMP_Clause { let arguments = (ins - OptionalAttr:$proc_bind_val + OptionalAttr:$proc_bind_kind ); let assemblyFormat = [{ - `proc_bind` `(` custom($proc_bind_val) `)` + `proc_bind` `(` custom($proc_bind_kind) `)` }]; let description = [{ - The optional `proc_bind_val` attribute controls the thread affinity for the + The optional `proc_bind_kind` attribute controls the thread affinity for the execution of the parallel region. 
}]; } @@ -967,14 +978,14 @@ class OpenMP_ReductionClauseSkip< let arguments = (ins Variadic:$reduction_vars, - OptionalAttr:$reduction_vars_byref, - OptionalAttr:$reductions + OptionalAttr:$reduction_byref, + OptionalAttr:$reduction_syms ); let assemblyFormat = [{ `reduction` `(` custom($reduction_vars, type($reduction_vars), - $reduction_vars_byref, $reductions) `)` + $reduction_byref, $reduction_syms) `)` }]; let extraClassDeclaration = [{ @@ -986,10 +997,10 @@ class OpenMP_ReductionClauseSkip< let description = [{ Reductions can be performed by specifying reduction accumulator variables in `reduction_vars`, symbols referring to reduction declarations in the - `reductions` attribute, and whether the reduction variable should be passed - into the reduction region by value or by reference in - `reduction_vars_byref`. Each reduction is identified by the accumulator it - uses and accumulators must not be repeated in the same reduction. A private + `reduction_syms` attribute, and whether the reduction variable should be + passed into the reduction region by value or by reference in + `reduction_byref`. Each reduction is identified by the accumulator it uses + and accumulators must not be repeated in the same reduction. A private variable corresponding to the accumulator is used in place of the accumulator inside the body of the operation. The reduction declaration specifies how to combine the values from each iteration, section, team, @@ -1036,22 +1047,22 @@ class OpenMP_ScheduleClauseSkip< > : OpenMP_Clause { let arguments = (ins - OptionalAttr:$schedule_val, - Optional:$schedule_chunk_var, - OptionalAttr:$schedule_modifier, - UnitAttr:$simd_modifier + OptionalAttr:$schedule_kind, + Optional:$schedule_chunk, + OptionalAttr:$schedule_mod, + UnitAttr:$schedule_simd ); let assemblyFormat = [{ `schedule` `(` - custom($schedule_val, $schedule_modifier, $simd_modifier, - $schedule_chunk_var, type($schedule_chunk_var)) `)` + custom($schedule_kind, $schedule_mod, $schedule_simd, + $schedule_chunk, type($schedule_chunk)) `)` }]; let description = [{ - The optional `schedule_val` attribute specifies the loop schedule for this + The optional `schedule_kind` attribute specifies the loop schedule for this loop, determining how the loop is distributed across the parallel threads. - The optional `schedule_chunk_var` associated with this determines further + The optional `schedule_chunk` associated with it further controls this distribution. }]; } @@ -1098,14 +1109,14 @@ class OpenMP_TaskReductionClauseSkip< let arguments = (ins Variadic:$task_reduction_vars, - OptionalAttr:$task_reduction_vars_byref, - OptionalAttr:$task_reductions + OptionalAttr:$task_reduction_byref, + OptionalAttr:$task_reduction_syms ); let assemblyFormat = [{ `task_reduction` `(` custom($task_reduction_vars, type($task_reduction_vars), - $task_reduction_vars_byref, $task_reductions) `)` + $task_reduction_byref, $task_reduction_syms) `)` }]; let description = [{ @@ -1115,9 +1126,9 @@ class OpenMP_TaskReductionClauseSkip< participating in the reduction. After the end of the region, the original list item contains the result of the reduction. 
Similarly to the `reduction` clause, accumulator variables must be passed in `task_reduction_vars`, - symbols referring to reduction declarations in the `task_reductions` + symbols referring to reduction declarations in the `task_reduction_syms` attribute, and whether the reduction variable should be passed into the - reduction region by value or by reference in `task_reduction_vars_byref`. + reduction region by value or by reference in `task_reduction_byref`. }]; let extraClassDeclaration = [{ @@ -1176,7 +1187,7 @@ class OpenMP_UntiedClauseSkip< If the `untied` clause is present on a task construct, any thread in the team can resume the task region after a suspension. The `untied` clause is ignored if a `final` clause is present on the same task construct and the - `final_expr` evaluates to `true`, or if a task is an included task. + `final` expression evaluates to `true`, or if a task is an included task. }]; } @@ -1192,16 +1203,16 @@ class OpenMP_UseDeviceAddrClauseSkip< > : OpenMP_Clause { let arguments = (ins - Variadic:$use_device_addr + Variadic:$use_device_addr_vars ); let assemblyFormat = [{ - `use_device_addr` `(` $use_device_addr `:` type($use_device_addr) `)` + `use_device_addr` `(` $use_device_addr_vars `:` type($use_device_addr_vars) `)` }]; let description = [{ - The optional `use_device_addr` specifies the address of the objects in the - device data environment. + The optional `use_device_addr_vars` specifies the address of the objects in + the device data environment. }]; } @@ -1217,15 +1228,15 @@ class OpenMP_UseDevicePtrClauseSkip< > : OpenMP_Clause { let arguments = (ins - Variadic:$use_device_ptr + Variadic:$use_device_ptr_vars ); let assemblyFormat = [{ - `use_device_ptr` `(` $use_device_ptr `:` type($use_device_ptr) `)` + `use_device_ptr` `(` $use_device_ptr_vars `:` type($use_device_ptr_vars) `)` }]; let description = [{ - The optional `use_device_ptr` specifies the device pointers to the + The optional `use_device_ptr_vars` specifies the device pointers to the corresponding list items in the device data environment. }]; } diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 69fd1f1f0130fd..16b6f624ce042b 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -145,15 +145,14 @@ def ParallelOp : OpenMP_Op<"parallel", traits = [ The parallel construct includes a region of code which is to be executed by a team of threads. - The optional `if_expr` parameter specifies a boolean result of a - conditional check. If this value is 1 or is not provided then the parallel - region runs as normal, if it is 0 then the parallel region is executed with - one thread. + The optional `if_expr` parameter specifies a boolean result of a conditional + check. If this value is 1 or is not provided then the parallel region runs + as normal, if it is 0 then the parallel region is executed with one thread. 
}] # clausesDescription; let builders = [ OpBuilder<(ins CArg<"ArrayRef", "{}">:$attributes)>, - OpBuilder<(ins CArg<"const ParallelClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const ParallelOperands &">:$clauses)> ]; // TODO: Use default assembly format inherited from OpenMP_Op once printing @@ -163,16 +162,16 @@ def ParallelOp : OpenMP_Op<"parallel", traits = [ let assemblyFormat = [{ oilist( `if` `(` $if_expr `)` - | `num_threads` `(` $num_threads_var `:` type($num_threads_var) `)` + | `num_threads` `(` $num_threads `:` type($num_threads) `)` | `allocate` `(` custom( $allocate_vars, type($allocate_vars), - $allocators_vars, type($allocators_vars) + $allocator_vars, type($allocator_vars) ) `)` - | `proc_bind` `(` custom($proc_bind_val) `)` + | `proc_bind` `(` custom($proc_bind_kind) `)` ) custom($region, $reduction_vars, type($reduction_vars), - $reduction_vars_byref, $reductions, $private_vars, - type($private_vars), $privatizers) attr-dict + $reduction_byref, $reduction_syms, $private_vars, + type($private_vars), $private_syms) attr-dict }]; let hasVerifier = 1; @@ -196,10 +195,9 @@ def TerminatorOp : OpenMP_Op<"terminator", [Terminator, Pure]> { def TeamsOp : OpenMP_Op<"teams", traits = [ AttrSizedOperandSegments, RecursiveMemoryEffects ], clauses = [ - // TODO: Complete clause list (private). // TODO: Sort clauses alphabetically. OpenMP_NumTeamsClause, OpenMP_IfClause, OpenMP_ThreadLimitClause, - OpenMP_AllocateClause, OpenMP_ReductionClause + OpenMP_AllocateClause, OpenMP_ReductionClause, OpenMP_PrivateClause ], singleRegion = true> { let summary = "teams construct"; let description = [{ @@ -212,7 +210,7 @@ def TeamsOp : OpenMP_Op<"teams", traits = [ }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const TeamsClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const TeamsOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -239,9 +237,9 @@ def SectionOp : OpenMP_Op<"section", [HasParent<"SectionsOp">], def SectionsOp : OpenMP_Op<"sections", traits = [ AttrSizedOperandSegments ], clauses = [ - // TODO: Complete clause list (private). // TODO: Sort clauses alphabetically. - OpenMP_ReductionClause, OpenMP_AllocateClause, OpenMP_NowaitClause + OpenMP_ReductionClause, OpenMP_AllocateClause, OpenMP_NowaitClause, + OpenMP_PrivateClause ], singleRegion = true> { let summary = "sections construct"; let description = [{ @@ -258,7 +256,7 @@ def SectionsOp : OpenMP_Op<"sections", traits = [ let regions = (region SizedRegion<1>:$region); let builders = [ - OpBuilder<(ins CArg<"const SectionsClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const SectionsOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -272,8 +270,8 @@ def SectionsOp : OpenMP_Op<"sections", traits = [ def SingleOp : OpenMP_Op<"single", traits = [ AttrSizedOperandSegments ], clauses = [ - // TODO: Complete clause list (private). 
- OpenMP_AllocateClause, OpenMP_CopyprivateClause, OpenMP_NowaitClause + OpenMP_AllocateClause, OpenMP_CopyprivateClause, OpenMP_NowaitClause, + OpenMP_PrivateClause ], singleRegion = true> { let summary = "single directive"; let description = [{ @@ -285,7 +283,7 @@ def SingleOp : OpenMP_Op<"single", traits = [ }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const SingleClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const SingleOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -298,7 +296,7 @@ def SingleOp : OpenMP_Op<"single", traits = [ def LoopNestOp : OpenMP_Op<"loop_nest", traits = [ RecursiveMemoryEffects, SameVariadicOperandSize ], clauses = [ - OpenMP_CollapseClause + OpenMP_LoopRelatedClause ], singleRegion = true> { let summary = "rectangular loop nest"; let description = [{ @@ -307,14 +305,14 @@ def LoopNestOp : OpenMP_Op<"loop_nest", traits = [ lower and upper bounds, as well as a step variable, must be defined. The lower and upper bounds specify a half-open range: the range includes the - lower bound but does not include the upper bound. If the `inclusive` + lower bound but does not include the upper bound. If the `loop_inclusive` attribute is specified then the upper bound is also included. The body region can contain any number of blocks. The region is terminated by an `omp.yield` instruction without operands. The induction variables, represented as entry block arguments to the loop nest operation's single - region, match the types of the `lowerBound`, `upperBound` and `step` - arguments. + region, match the types of the `loop_lower_bounds`, `loop_upper_bounds` and + `loop_steps` arguments. ```mlir omp.loop_nest (%i1, %i2) : i32 = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) { @@ -336,10 +334,8 @@ def LoopNestOp : OpenMP_Op<"loop_nest", traits = [ non-perfectly nested loops. }]; - let arguments = !con(clausesArgs, (ins UnitAttr:$inclusive)); - let builders = [ - OpBuilder<(ins CArg<"const LoopNestClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const LoopNestOperands &">:$clauses)> ]; let extraClassDeclaration = [{ @@ -366,14 +362,15 @@ def WsloopOp : OpenMP_Op<"wsloop", traits = [ AttrSizedOperandSegments, DeclareOpInterfaceMethods, RecursiveMemoryEffects, SingleBlock ], clauses = [ - // TODO: Complete clause list (allocate, private). // TODO: Sort clauses alphabetically. 
OpenMP_LinearClauseSkip, OpenMP_ReductionClauseSkip, OpenMP_ScheduleClauseSkip, OpenMP_NowaitClauseSkip, OpenMP_OrderedClauseSkip, - OpenMP_OrderClauseSkip + OpenMP_OrderClauseSkip, + OpenMP_AllocateClauseSkip, + OpenMP_PrivateClauseSkip ], singleRegion = true> { let summary = "worksharing-loop construct"; let description = [{ @@ -402,7 +399,7 @@ def WsloopOp : OpenMP_Op<"wsloop", traits = [ let builders = [ OpBuilder<(ins CArg<"ArrayRef", "{}">:$attributes)>, - OpBuilder<(ins CArg<"const WsloopClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const WsloopOperands &">:$clauses)> ]; // TODO: Use default assembly format inherited from OpenMP_Op once printing @@ -415,13 +412,20 @@ def WsloopOp : OpenMP_Op<"wsloop", traits = [ $linear_step_vars) `)` |`schedule` `(` custom( - $schedule_val, $schedule_modifier, $simd_modifier, - $schedule_chunk_var, type($schedule_chunk_var)) `)` + $schedule_kind, $schedule_mod, $schedule_simd, + $schedule_chunk, type($schedule_chunk)) `)` |`nowait` $nowait - |`ordered` `(` $ordered_val `)` - |`order` `(` custom($order_val, $order_mod) `)` + |`ordered` `(` $ordered `)` + |`order` `(` custom($order, $order_mod) `)` + |`allocate` `(` + custom( + $allocate_vars, type($allocate_vars), $allocator_vars, + type($allocator_vars)) `)` + |`private` `(` + custom( + $private_vars, type($private_vars), $private_syms) `)` ) custom($region, $reduction_vars, type($reduction_vars), - $reduction_vars_byref, $reductions) attr-dict + $reduction_byref, $reduction_syms) attr-dict }]; let hasVerifier = 1; @@ -435,9 +439,9 @@ def SimdOp : OpenMP_Op<"simd", traits = [ AttrSizedOperandSegments, DeclareOpInterfaceMethods, RecursiveMemoryEffects, SingleBlock ], clauses = [ - // TODO: Complete clause list (linear, private, reduction). - OpenMP_AlignedClause, OpenMP_IfClause, OpenMP_NontemporalClause, - OpenMP_OrderClause, OpenMP_SafelenClause, OpenMP_SimdlenClause + OpenMP_AlignedClause, OpenMP_IfClause, OpenMP_LinearClause, + OpenMP_NontemporalClause, OpenMP_OrderClause, OpenMP_PrivateClause, + OpenMP_ReductionClause, OpenMP_SafelenClause, OpenMP_SimdlenClause ], singleRegion = true> { let summary = "simd construct"; let description = [{ @@ -468,7 +472,7 @@ def SimdOp : OpenMP_Op<"simd", traits = [ }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const SimdClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const SimdOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -502,9 +506,9 @@ def DistributeOp : OpenMP_Op<"distribute", traits = [ AttrSizedOperandSegments, DeclareOpInterfaceMethods, RecursiveMemoryEffects, SingleBlock ], clauses = [ - // TODO: Complete clause list (private). // TODO: Sort clauses alphabetically. - OpenMP_DistScheduleClause, OpenMP_AllocateClause, OpenMP_OrderClause + OpenMP_DistScheduleClause, OpenMP_AllocateClause, OpenMP_OrderClause, + OpenMP_PrivateClause ], singleRegion = true> { let summary = "distribute construct"; let description = [{ @@ -541,7 +545,7 @@ def DistributeOp : OpenMP_Op<"distribute", traits = [ }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const DistributeClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const DistributeOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -555,11 +559,12 @@ def TaskOp : OpenMP_Op<"task", traits = [ AttrSizedOperandSegments, AutomaticAllocationScope, OutlineableOpenMPOpInterface ], clauses = [ - // TODO: Complete clause list (affinity, detach, private). + // TODO: Complete clause list (affinity, detach). // TODO: Sort clauses alphabetically. 
OpenMP_IfClause, OpenMP_FinalClause, OpenMP_UntiedClause, OpenMP_MergeableClause, OpenMP_InReductionClause, - OpenMP_PriorityClause, OpenMP_DependClause, OpenMP_AllocateClause + OpenMP_PriorityClause, OpenMP_DependClause, OpenMP_AllocateClause, + OpenMP_PrivateClause ], singleRegion = true> { let summary = "task construct"; let description = [{ @@ -576,12 +581,12 @@ def TaskOp : OpenMP_Op<"task", traits = [ The `in_reduction` clause specifies that this particular task (among all the tasks in current taskgroup, if any) participates in a reduction. - `in_reduction_vars_byref` indicates whether each reduction variable should + `in_reduction_byref` indicates whether each reduction variable should be passed by value or by reference. }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const TaskClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const TaskOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -592,14 +597,13 @@ def TaskloopOp : OpenMP_Op<"taskloop", traits = [ DeclareOpInterfaceMethods, RecursiveMemoryEffects, SingleBlock ], clauses = [ - // TODO: Complete clause list (private). // TODO: Sort clauses alphabetically. OpenMP_IfClause, OpenMP_FinalClause, OpenMP_UntiedClause, OpenMP_MergeableClause, OpenMP_InReductionClauseSkip, OpenMP_ReductionClauseSkip, OpenMP_PriorityClause, OpenMP_AllocateClause, OpenMP_GrainsizeClause, - OpenMP_NumTasksClause, OpenMP_NogroupClause + OpenMP_NumTasksClause, OpenMP_NogroupClause, OpenMP_PrivateClause ], singleRegion = true> { let summary = "taskloop construct"; let description = [{ @@ -639,7 +643,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", traits = [ items is present. Thus, the generated tasks are participants of a reduction previously defined by a reduction scoping clause. In this case, accumulator variables are specified in `in_reduction_vars`, symbols referring to - reduction declarations in `in_reductions` and `in_reduction_vars_byref` + reduction declarations in `in_reduction_syms` and `in_reduction_byref` indicate for each reduction variable whether it should be passed by value or by reference. @@ -654,7 +658,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", traits = [ }]; let builders = [ - OpBuilder<(ins CArg<"const TaskloopClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const TaskloopOperands &">:$clauses)> ]; let extraClassDeclaration = [{ @@ -688,7 +692,7 @@ def TaskgroupOp : OpenMP_Op<"taskgroup", traits = [ }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const TaskgroupClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const TaskgroupOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -965,14 +969,13 @@ def TargetDataOp: OpenMP_Op<"target_data", traits = [ to and from the offloading device when multiple target regions are using the same data. - The optional `if_expr` parameter specifies a boolean result of a - conditional check. If this value is 1 or is not provided then the target - region runs on a device, if it is 0 then the target region is executed - on the host device. + The optional `if_expr` parameter specifies a boolean result of a conditional + check. If this value is 1 or is not provided then the target region runs on + a device, if it is 0 then the target region is executed on the host device. }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const TargetDataClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const TargetDataOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -995,14 +998,13 @@ def TargetEnterDataOp: OpenMP_Op<"target_enter_data", traits = [ a device data environment. 
The target enter data directive is a stand-alone directive. - The optional `if_expr` parameter specifies a boolean result of a - conditional check. If this value is 1 or is not provided then the target - region runs on a device, if it is 0 then the target region is executed on - the host device. + The optional `if_expr` parameter specifies a boolean result of a conditional + check. If this value is 1 or is not provided then the target region runs on + a device, if it is 0 then the target region is executed on the host device. }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -1025,14 +1027,13 @@ def TargetExitDataOp: OpenMP_Op<"target_exit_data", traits = [ device data environment. The target exit data directive is a stand-alone directive. - The optional `if_expr` parameter specifies a boolean result of a - conditional check. If this value is 1 or is not provided then the target - region runs on a device, if it is 0 then the target region is executed - on the host device. + The optional `if_expr` parameter specifies a boolean result of a conditional + check. If this value is 1 or is not provided then the target region runs on + a device, if it is 0 then the target region is executed on the host device. }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -1056,10 +1057,9 @@ def TargetUpdateOp: OpenMP_Op<"target_update", traits = [ specified motion clauses. The target update construct is a stand-alone directive. - The optional `if_expr` parameter specifies a boolean result of a - conditional check. If this value is 1 or is not provided then the target - region runs on a device, if it is 0 then the target region is executed - on the host device. + The optional `if_expr` parameter specifies a boolean result of a conditional + check. If this value is 1 or is not provided then the target region runs on + a device, if it is 0 then the target region is executed on the host device. We use `MapInfoOp` to model the motion clauses and their modifiers. Even though the spec differentiates between map-types & map-type-modifiers vs. @@ -1070,7 +1070,7 @@ def TargetUpdateOp: OpenMP_Op<"target_update", traits = [ }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -1083,26 +1083,25 @@ def TargetUpdateOp: OpenMP_Op<"target_update", traits = [ def TargetOp : OpenMP_Op<"target", traits = [ AttrSizedOperandSegments, IsolatedFromAbove, OutlineableOpenMPOpInterface ], clauses = [ - // TODO: Complete clause list (allocate, defaultmap, in_reduction, - // uses_allocators). + // TODO: Complete clause list (defaultmap, uses_allocators). // TODO: Sort clauses alphabetically. 
OpenMP_IfClause, OpenMP_DeviceClause, OpenMP_ThreadLimitClause, OpenMP_DependClause, OpenMP_NowaitClause, OpenMP_IsDevicePtrClause, - OpenMP_HasDeviceAddrClause, OpenMP_MapClause, OpenMP_PrivateClause + OpenMP_HasDeviceAddrClause, OpenMP_MapClause, OpenMP_PrivateClause, + OpenMP_AllocateClause, OpenMP_InReductionClause ], singleRegion = true> { let summary = "target construct"; let description = [{ The target construct includes a region of code which is to be executed on a device. - The optional `if_expr` parameter specifies a boolean result of a - conditional check. If this value is 1 or is not provided then the target - region runs on a device, if it is 0 then the target region is executed on the - host device. + The optional `if_expr` parameter specifies a boolean result of a conditional + check. If this value is 1 or is not provided then the target region runs on + a device, if it is 0 then the target region is executed on the host device. }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const TargetClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const TargetOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -1125,9 +1124,7 @@ def MasterOp : OpenMP_Op<"master", singleRegion = true> { //===----------------------------------------------------------------------===// // 2.17.1 critical Construct //===----------------------------------------------------------------------===// -def CriticalDeclareOp : OpenMP_Op<"critical.declare", traits = [ - Symbol - ], clauses = [ +def CriticalDeclareOp : OpenMP_Op<"critical.declare", clauses = [ OpenMP_CriticalNameClause, OpenMP_HintClause ]> { let summary = "declares a named critical section."; @@ -1136,7 +1133,7 @@ def CriticalDeclareOp : OpenMP_Op<"critical.declare", traits = [ }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const CriticalClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const CriticalDeclareOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -1189,7 +1186,7 @@ def OrderedOp : OpenMP_Op<"ordered", clauses = [OpenMP_DoacrossClause]> { }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const OrderedOpClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const OrderedOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -1206,7 +1203,7 @@ def OrderedRegionOp : OpenMP_Op<"ordered.region", clauses = [ }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const OrderedRegionClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const OrderedRegionOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -1217,7 +1214,7 @@ def OrderedRegionOp : OpenMP_Op<"ordered.region", clauses = [ //===----------------------------------------------------------------------===// def TaskwaitOp : OpenMP_Op<"taskwait", clauses = [ - // TODO: Complete clause list (depend, nowait). + OpenMP_DependClause, OpenMP_NowaitClause ]> { let summary = "taskwait construct"; let description = [{ @@ -1226,11 +1223,8 @@ def TaskwaitOp : OpenMP_Op<"taskwait", clauses = [ }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const TaskwaitClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const TaskwaitOperands &">:$clauses)> ]; - - // TODO: Remove overriden `assemblyFormat` once a clause is added. - let assemblyFormat = "attr-dict"; } //===----------------------------------------------------------------------===// @@ -1264,8 +1258,8 @@ def AtomicReadOp : OpenMP_Op<"atomic.read", traits = [ // Override clause-based assemblyFormat. 
let assemblyFormat = [{ $v `=` $x - oilist( `memory_order` `(` custom($memory_order_val) `)` - | `hint` `(` custom($hint_val) `)`) + oilist( `memory_order` `(` custom($memory_order) `)` + | `hint` `(` custom($hint) `)`) `:` type($x) `,` $element_type attr-dict }]; @@ -1308,8 +1302,8 @@ def AtomicWriteOp : OpenMP_Op<"atomic.write", traits = [ // Override clause-based assemblyFormat. let assemblyFormat = [{ $x `=` $expr - oilist( `hint` `(` custom($hint_val) `)` - | `memory_order` `(` custom($memory_order_val) `)`) + oilist( `hint` `(` custom($hint) `)` + | `memory_order` `(` custom($memory_order) `)`) `:` type($x) `,` type($expr) attr-dict }]; @@ -1371,8 +1365,8 @@ def AtomicUpdateOp : OpenMP_Op<"atomic.update", traits = [ // Override clause-based assemblyFormat. let assemblyFormat = [{ - oilist( `memory_order` `(` custom($memory_order_val) `)` - | `hint` `(` custom($hint_val) `)`) + oilist( `memory_order` `(` custom($memory_order) `)` + | `hint` `(` custom($hint) `)`) $x `:` type($x) $region attr-dict }]; @@ -1505,7 +1499,7 @@ def CancelOp : OpenMP_Op<"cancel", clauses = [ }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const CancelClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const CancelOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -1525,7 +1519,7 @@ def CancellationPointOp : OpenMP_Op<"cancellation_point", clauses = [ }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const CancellationPointClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const CancellationPointOperands &">:$clauses)> ]; let hasVerifier = 1; @@ -1605,7 +1599,7 @@ def MaskedOp : OpenMP_Op<"masked", clauses = [ }] # clausesDescription; let builders = [ - OpBuilder<(ins CArg<"const MaskedClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const MaskedOperands &">:$clauses)> ]; } diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td index 385aa8b1b016a6..45d30a41bd29b7 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td @@ -41,14 +41,14 @@ def MapClauseOwningOpInterface : OpInterface<"MapClauseOwningOpInterface"> { let cppNamespace = "::mlir::omp"; let methods = [ - InterfaceMethod<"Get map operands", "::mlir::OperandRange", "getMapOperands", + InterfaceMethod<"Get map operands", "::mlir::OperandRange", "getMapVars", (ins), [{ - return $_op.getMapOperands(); + return $_op.getMapVars(); }]>, InterfaceMethod<"Get mutable map operands", "::mlir::MutableOperandRange", - "getMapOperandsMutable", + "getMapVarsMutable", (ins), [{ - return $_op.getMapOperandsMutable(); + return $_op.getMapVarsMutable(); }]>, ]; } diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp index fdc4c7be1ca5c3..169171ab799b71 100644 --- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp +++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp @@ -366,7 +366,7 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> { // Declare reductions. // TODO: consider checking whether there is already a compatible reduction // declaration and using it instead of redeclaring. 
- SmallVector<Attribute> reductionDeclSymbols; + SmallVector<Attribute> reductionSyms; SmallVector<omp::DeclareReductionOp> ompReductionDecls; auto reduce = cast<scf::ReduceOp>(parallelOp.getBody()->getTerminator()); for (int64_t i = 0, e = parallelOp.getNumReductions(); i < e; ++i) { @@ -374,7 +374,7 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> { ompReductionDecls.push_back(decl); if (!decl) return failure(); - reductionDeclSymbols.push_back( + reductionSyms.push_back( SymbolRefAttr::get(rewriter.getContext(), decl.getSymName())); } @@ -445,15 +445,15 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> { auto ompParallel = rewriter.create<omp::ParallelOp>( loc, /* if_expr = */ Value{}, - /* num_threads_var = */ numThreadsVar, + /* num_threads = */ numThreadsVar, /* allocate_vars = */ llvm::SmallVector<Value>{}, - /* allocators_vars = */ llvm::SmallVector<Value>{}, + /* allocator_vars = */ llvm::SmallVector<Value>{}, /* reduction_vars = */ llvm::SmallVector<Value>{}, - /* reduction_vars_isbyref = */ DenseBoolArrayAttr{}, - /* reductions = */ ArrayAttr{}, - /* proc_bind_val = */ omp::ClauseProcBindKindAttr{}, + /* reduction_byref = */ DenseBoolArrayAttr{}, + /* reduction_syms = */ ArrayAttr{}, + /* proc_bind_kind = */ omp::ClauseProcBindKindAttr{}, /* private_vars = */ ValueRange(), - /* privatizers = */ nullptr); + /* private_syms = */ nullptr); { OpBuilder::InsertionGuard guard(rewriter); @@ -465,15 +465,15 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> { // Create worksharing loop wrapper. auto wsloopOp = rewriter.create<omp::WsloopOp>(parallelOp.getLoc()); if (!reductionVariables.empty()) { - wsloopOp.setReductionsAttr( - ArrayAttr::get(rewriter.getContext(), reductionDeclSymbols)); + wsloopOp.setReductionSymsAttr( + ArrayAttr::get(rewriter.getContext(), reductionSyms)); wsloopOp.getReductionVarsMutable().append(reductionVariables); - llvm::SmallVector<bool> byRefVec; + llvm::SmallVector<bool> reductionByRef; // false because these reductions always reduce scalars and so do // not need to be passed by reference - byRefVec.resize(reductionVariables.size(), false); - wsloopOp.setReductionVarsByref( - DenseBoolArrayAttr::get(rewriter.getContext(), byRefVec)); + reductionByRef.resize(reductionVariables.size(), false); + wsloopOp.setReductionByref( + DenseBoolArrayAttr::get(rewriter.getContext(), reductionByRef)); } rewriter.create<omp::TerminatorOp>(loc); // omp.parallel terminator. 
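As a companion to the lowering above, a hedged sketch of driving the renamed `omp.wsloop` reduction accessors (`setReductionSymsAttr`, `setReductionByref`) from outside this pass; the helper and its arguments are hypothetical, not part of this patch:

```cpp
#include "llvm/ADT/SmallVector.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/Builders.h"

// Illustrative helper: attach a single scalar reduction to an omp.wsloop.
static void attachScalarReduction(mlir::OpBuilder &b, mlir::omp::WsloopOp loop,
                                  mlir::Value accumulator,
                                  mlir::SymbolRefAttr declSym) {
  // Symbol of the matching reduction declaration (attribute formerly named
  // `reductions`, now `reduction_syms`).
  loop.setReductionSymsAttr(b.getArrayAttr({declSym}));
  llvm::SmallVector<mlir::Value> vars{accumulator};
  loop.getReductionVarsMutable().append(vars);
  // A scalar accumulator is passed by value, hence the single `false`
  // (attribute formerly named `reduction_vars_byref`, now `reduction_byref`).
  loop.setReductionByref(
      mlir::DenseBoolArrayAttr::get(b.getContext(), {false}));
}
```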
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index f5ec5a476ad8fa..51e9dbe8c801d1 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -117,39 +117,39 @@ void OpenMPDialect::initialize() { /// ssa-id-and-type ::= ssa-id `:` type static ParseResult parseAllocateAndAllocator( OpAsmParser &parser, - SmallVectorImpl &operandsAllocate, - SmallVectorImpl &typesAllocate, - SmallVectorImpl &operandsAllocator, - SmallVectorImpl &typesAllocator) { + SmallVectorImpl &allocateVars, + SmallVectorImpl &allocateTypes, + SmallVectorImpl &allocatorVars, + SmallVectorImpl &allocatorTypes) { return parser.parseCommaSeparatedList([&]() { OpAsmParser::UnresolvedOperand operand; Type type; if (parser.parseOperand(operand) || parser.parseColonType(type)) return failure(); - operandsAllocator.push_back(operand); - typesAllocator.push_back(type); + allocatorVars.push_back(operand); + allocatorTypes.push_back(type); if (parser.parseArrow()) return failure(); if (parser.parseOperand(operand) || parser.parseColonType(type)) return failure(); - operandsAllocate.push_back(operand); - typesAllocate.push_back(type); + allocateVars.push_back(operand); + allocateTypes.push_back(type); return success(); }); } /// Print allocate clause static void printAllocateAndAllocator(OpAsmPrinter &p, Operation *op, - OperandRange varsAllocate, - TypeRange typesAllocate, - OperandRange varsAllocator, - TypeRange typesAllocator) { - for (unsigned i = 0; i < varsAllocate.size(); ++i) { - std::string separator = i == varsAllocate.size() - 1 ? "" : ", "; - p << varsAllocator[i] << " : " << typesAllocator[i] << " -> "; - p << varsAllocate[i] << " : " << typesAllocate[i] << separator; + OperandRange allocateVars, + TypeRange allocateTypes, + OperandRange allocatorVars, + TypeRange allocatorTypes) { + for (unsigned i = 0; i < allocateVars.size(); ++i) { + std::string separator = i == allocateVars.size() - 1 ? 
"" : ", "; + p << allocatorVars[i] << " : " << allocatorTypes[i] << " -> "; + p << allocateVars[i] << " : " << allocateTypes[i] << separator; } } @@ -183,11 +183,11 @@ void printClauseAttr(OpAsmPrinter &p, Operation *op, ClauseAttr attr) { /// linear ::= `linear` `(` linear-list `)` /// linear-list := linear-val | linear-val linear-list /// linear-val := ssa-id-and-type `=` ssa-id-and-type -static ParseResult -parseLinearClause(OpAsmParser &parser, - SmallVectorImpl &vars, - SmallVectorImpl &types, - SmallVectorImpl &stepVars) { +static ParseResult parseLinearClause( + OpAsmParser &parser, + SmallVectorImpl &linearVars, + SmallVectorImpl &linearTypes, + SmallVectorImpl &linearStepVars) { return parser.parseCommaSeparatedList([&]() { OpAsmParser::UnresolvedOperand var; Type type; @@ -196,16 +196,16 @@ parseLinearClause(OpAsmParser &parser, parser.parseOperand(stepVar) || parser.parseColonType(type)) return failure(); - vars.push_back(var); - types.push_back(type); - stepVars.push_back(stepVar); + linearVars.push_back(var); + linearTypes.push_back(type); + linearStepVars.push_back(stepVar); return success(); }); } /// Print Linear Clause static void printLinearClause(OpAsmPrinter &p, Operation *op, - ValueRange linearVars, TypeRange linearVarTypes, + ValueRange linearVars, TypeRange linearTypes, ValueRange linearStepVars) { size_t linearVarsSize = linearVars.size(); for (unsigned i = 0; i < linearVarsSize; ++i) { @@ -221,12 +221,12 @@ static void printLinearClause(OpAsmPrinter &p, Operation *op, // Verifier for Nontemporal Clause //===----------------------------------------------------------------------===// -static LogicalResult -verifyNontemporalClause(Operation *op, OperandRange nontemporalVariables) { +static LogicalResult verifyNontemporalClause(Operation *op, + OperandRange nontemporalVars) { // Check if each var is unique - OpenMP 5.0 -> 2.9.3.1 section DenseSet nontemporalItems; - for (const auto &it : nontemporalVariables) + for (const auto &it : nontemporalVars) if (!nontemporalItems.insert(it).second) return op->emitOpError() << "nontemporal variable used more than once"; @@ -236,32 +236,32 @@ verifyNontemporalClause(Operation *op, OperandRange nontemporalVariables) { //===----------------------------------------------------------------------===// // Parser, verifier and printer for Aligned Clause //===----------------------------------------------------------------------===// -static LogicalResult -verifyAlignedClause(Operation *op, std::optional alignmentValues, - OperandRange alignedVariables) { +static LogicalResult verifyAlignedClause(Operation *op, + std::optional alignments, + OperandRange alignedVars) { // Check if number of alignment values equals to number of aligned variables - if (!alignedVariables.empty()) { - if (!alignmentValues || alignmentValues->size() != alignedVariables.size()) + if (!alignedVars.empty()) { + if (!alignments || alignments->size() != alignedVars.size()) return op->emitOpError() << "expected as many alignment values as aligned variables"; } else { - if (alignmentValues) + if (alignments) return op->emitOpError() << "unexpected alignment values attribute"; return success(); } // Check if each var is aligned only once - OpenMP 4.5 -> 2.8.1 section DenseSet alignedItems; - for (auto it : alignedVariables) + for (auto it : alignedVars) if (!alignedItems.insert(it).second) return op->emitOpError() << "aligned variable used more than once"; - if (!alignmentValues) + if (!alignments) return success(); // Check if all alignment values are positive - 
OpenMP 4.5 -> 2.8.1 section - for (unsigned i = 0; i < (*alignmentValues).size(); ++i) { - if (auto intAttr = llvm::dyn_cast((*alignmentValues)[i])) { + for (unsigned i = 0; i < (*alignments).size(); ++i) { + if (auto intAttr = llvm::dyn_cast((*alignments)[i])) { if (intAttr.getValue().sle(0)) return op->emitOpError() << "alignment should be greater than 0"; } else { @@ -275,14 +275,15 @@ verifyAlignedClause(Operation *op, std::optional alignmentValues, /// aligned ::= `aligned` `(` aligned-list `)` /// aligned-list := aligned-val | aligned-val aligned-list /// aligned-val := ssa-id-and-type `->` alignment -static ParseResult parseAlignedClause( - OpAsmParser &parser, - SmallVectorImpl &alignedItems, - SmallVectorImpl &types, ArrayAttr &alignmentValues) { +static ParseResult +parseAlignedClause(OpAsmParser &parser, + SmallVectorImpl &alignedVars, + SmallVectorImpl &alignedTypes, + ArrayAttr &alignmentsAttr) { SmallVector alignmentVec; if (failed(parser.parseCommaSeparatedList([&]() { - if (parser.parseOperand(alignedItems.emplace_back()) || - parser.parseColonType(types.emplace_back()) || + if (parser.parseOperand(alignedVars.emplace_back()) || + parser.parseColonType(alignedTypes.emplace_back()) || parser.parseArrow() || parser.parseAttribute(alignmentVec.emplace_back())) { return failure(); @@ -291,20 +292,19 @@ static ParseResult parseAlignedClause( }))) return failure(); SmallVector alignments(alignmentVec.begin(), alignmentVec.end()); - alignmentValues = ArrayAttr::get(parser.getContext(), alignments); + alignmentsAttr = ArrayAttr::get(parser.getContext(), alignments); return success(); } /// Print Aligned Clause static void printAlignedClause(OpAsmPrinter &p, Operation *op, - ValueRange alignedVars, - TypeRange alignedVarTypes, - std::optional alignmentValues) { + ValueRange alignedVars, TypeRange alignedTypes, + std::optional alignments) { for (unsigned i = 0; i < alignedVars.size(); ++i) { if (i != 0) p << ", "; p << alignedVars[i] << " : " << alignedVars[i].getType(); - p << " -> " << (*alignmentValues)[i]; + p << " -> " << (*alignments)[i]; } } @@ -353,10 +353,11 @@ verifyScheduleModifiers(OpAsmParser &parser, /// sched-wo-chunk ::= `auto` | `runtime` /// sched-modifier ::= sched-mod-val | sched-mod-val `,` sched-mod-val /// sched-mod-val ::= `monotonic` | `nonmonotonic` | `simd` | `none` -static ParseResult parseScheduleClause( - OpAsmParser &parser, ClauseScheduleKindAttr &scheduleAttr, - ScheduleModifierAttr &scheduleModifier, UnitAttr &simdModifier, - std::optional &chunkSize, Type &chunkType) { +static ParseResult +parseScheduleClause(OpAsmParser &parser, ClauseScheduleKindAttr &scheduleAttr, + ScheduleModifierAttr &scheduleMod, UnitAttr &scheduleSimd, + std::optional &chunkSize, + Type &chunkType) { StringRef keyword; if (parser.parseKeyword(&keyword)) return failure(); @@ -399,14 +400,14 @@ static ParseResult parseScheduleClause( SMLoc loc = parser.getCurrentLocation(); if (std::optional mod = symbolizeScheduleModifier(modifiers[0])) { - scheduleModifier = ScheduleModifierAttr::get(parser.getContext(), *mod); + scheduleMod = ScheduleModifierAttr::get(parser.getContext(), *mod); } else { return parser.emitError(loc, "invalid schedule modifier"); } // Only SIMD attribute is allowed here! 
if (modifiers.size() > 1) { assert(symbolizeScheduleModifier(modifiers[1]) == ScheduleModifier::simd); - simdModifier = UnitAttr::get(parser.getBuilder().getContext()); + scheduleSimd = UnitAttr::get(parser.getBuilder().getContext()); } } @@ -415,16 +416,16 @@ static ParseResult parseScheduleClause( /// Print schedule clause static void printScheduleClause(OpAsmPrinter &p, Operation *op, - ClauseScheduleKindAttr schedAttr, - ScheduleModifierAttr modifier, UnitAttr simd, - Value scheduleChunkVar, + ClauseScheduleKindAttr scheduleKind, + ScheduleModifierAttr scheduleMod, + UnitAttr scheduleSimd, Value scheduleChunk, Type scheduleChunkType) { - p << stringifyClauseScheduleKind(schedAttr.getValue()); - if (scheduleChunkVar) - p << " = " << scheduleChunkVar << " : " << scheduleChunkVar.getType(); - if (modifier) - p << ", " << stringifyScheduleModifier(modifier.getValue()); - if (simd) + p << stringifyClauseScheduleKind(scheduleKind.getValue()); + if (scheduleChunk) + p << " = " << scheduleChunk << " : " << scheduleChunk.getType(); + if (scheduleMod) + p << ", " << stringifyScheduleModifier(scheduleMod.getValue()); + if (scheduleSimd) p << ", simd"; } @@ -435,15 +436,15 @@ static void printScheduleClause(OpAsmPrinter &p, Operation *op, // order ::= `order` `(` [order-modifier ':'] concurrent `)` // order-modifier ::= reproducible | unconstrained static ParseResult parseOrderClause(OpAsmParser &parser, - ClauseOrderKindAttr &kindAttr, - OrderModifierAttr &modifierAttr) { + ClauseOrderKindAttr &order, + OrderModifierAttr &orderMod) { StringRef enumStr; SMLoc loc = parser.getCurrentLocation(); if (parser.parseKeyword(&enumStr)) return failure(); if (std::optional enumValue = symbolizeOrderModifier(enumStr)) { - modifierAttr = OrderModifierAttr::get(parser.getContext(), *enumValue); + orderMod = OrderModifierAttr::get(parser.getContext(), *enumValue); if (parser.parseOptionalColon()) return failure(); loc = parser.getCurrentLocation(); @@ -452,19 +453,19 @@ static ParseResult parseOrderClause(OpAsmParser &parser, } if (std::optional enumValue = symbolizeClauseOrderKind(enumStr)) { - kindAttr = ClauseOrderKindAttr::get(parser.getContext(), *enumValue); + order = ClauseOrderKindAttr::get(parser.getContext(), *enumValue); return success(); } return parser.emitError(loc, "invalid clause value: '") << enumStr << "'"; } static void printOrderClause(OpAsmPrinter &p, Operation *op, - ClauseOrderKindAttr kindAttr, - OrderModifierAttr modifierAttr) { - if (modifierAttr) - p << stringifyOrderModifier(modifierAttr.getValue()) << ":"; - if (kindAttr) - p << stringifyClauseOrderKind(kindAttr.getValue()); + ClauseOrderKindAttr order, + OrderModifierAttr orderMod) { + if (orderMod) + p << stringifyOrderModifier(orderMod.getValue()) << ":"; + if (order) + p << stringifyClauseOrderKind(order.getValue()); } //===----------------------------------------------------------------------===// @@ -474,8 +475,7 @@ static void printOrderClause(OpAsmPrinter &p, Operation *op, static ParseResult parseClauseWithRegionArgs( OpAsmParser &parser, Region ®ion, SmallVectorImpl &operands, - SmallVectorImpl &types, DenseBoolArrayAttr &isByRef, - ArrayAttr &symbols, + SmallVectorImpl &types, DenseBoolArrayAttr &byref, ArrayAttr &symbols, SmallVectorImpl ®ionPrivateArgs) { SmallVector reductionVec; SmallVector isByRefVec; @@ -494,7 +494,7 @@ static ParseResult parseClauseWithRegionArgs( return success(); }))) return failure(); - isByRef = makeDenseBoolArrayAttr(parser.getContext(), isByRefVec); + byref = 
makeDenseBoolArrayAttr(parser.getContext(), isByRefVec); auto *argsBegin = regionPrivateArgs.begin(); MutableArrayRef argsSubrange(argsBegin + regionArgOffset, @@ -510,13 +510,13 @@ static ParseResult parseClauseWithRegionArgs( static void printClauseWithRegionArgs(OpAsmPrinter &p, Operation *op, ValueRange argsSubrange, StringRef clauseName, ValueRange operands, - TypeRange types, DenseBoolArrayAttr byRef, + TypeRange types, DenseBoolArrayAttr byref, ArrayAttr symbols) { if (!clauseName.empty()) p << clauseName << "("; llvm::interleaveComma(llvm::zip_equal(symbols, operands, argsSubrange, types, - byRef.asArrayRef()), + byref.asArrayRef()), p, [&p](auto t) { auto [sym, op, arg, type, isByRef] = t; p << (isByRef ? "byref " : "") << sym << " " << op @@ -529,28 +529,27 @@ static void printClauseWithRegionArgs(OpAsmPrinter &p, Operation *op, static ParseResult parseParallelRegion( OpAsmParser &parser, Region ®ion, - SmallVectorImpl &reductionVarOperands, - SmallVectorImpl &reductionVarTypes, - DenseBoolArrayAttr &reductionByRef, ArrayAttr &reductionSymbols, - llvm::SmallVectorImpl &privateVarOperands, - llvm::SmallVectorImpl &privateVarsTypes, - ArrayAttr &privatizerSymbols) { + SmallVectorImpl &reductionVars, + SmallVectorImpl &reductionTypes, DenseBoolArrayAttr &reductionByref, + ArrayAttr &reductionSyms, + llvm::SmallVectorImpl &privateVars, + llvm::SmallVectorImpl &privateTypes, ArrayAttr &privateSyms) { llvm::SmallVector regionPrivateArgs; if (succeeded(parser.parseOptionalKeyword("reduction"))) { - if (failed(parseClauseWithRegionArgs(parser, region, reductionVarOperands, - reductionVarTypes, reductionByRef, - reductionSymbols, regionPrivateArgs))) + if (failed(parseClauseWithRegionArgs(parser, region, reductionVars, + reductionTypes, reductionByref, + reductionSyms, regionPrivateArgs))) return failure(); } if (succeeded(parser.parseOptionalKeyword("private"))) { - auto privateByRef = DenseBoolArrayAttr::get(parser.getContext(), {}); - if (failed(parseClauseWithRegionArgs(parser, region, privateVarOperands, - privateVarsTypes, privateByRef, - privatizerSymbols, regionPrivateArgs))) + auto privateByref = DenseBoolArrayAttr::get(parser.getContext(), {}); + if (failed(parseClauseWithRegionArgs(parser, region, privateVars, + privateTypes, privateByref, + privateSyms, regionPrivateArgs))) return failure(); - if (llvm::any_of(privateByRef.asArrayRef(), + if (llvm::any_of(privateByref.asArrayRef(), [](bool byref) { return byref; })) { parser.emitError(parser.getCurrentLocation(), "private clause cannot have byref attributes"); @@ -562,35 +561,30 @@ static ParseResult parseParallelRegion( } static void printParallelRegion(OpAsmPrinter &p, Operation *op, Region ®ion, - ValueRange reductionVarOperands, - TypeRange reductionVarTypes, - DenseBoolArrayAttr reductionVarIsByRef, - ArrayAttr reductionSymbols, - ValueRange privateVarOperands, - TypeRange privateVarTypes, - ArrayAttr privatizerSymbols) { - if (reductionSymbols) { + ValueRange reductionVars, + TypeRange reductionTypes, + DenseBoolArrayAttr reductionByref, + ArrayAttr reductionSyms, ValueRange privateVars, + TypeRange privateTypes, ArrayAttr privateSyms) { + if (reductionSyms) { auto *argsBegin = region.front().getArguments().begin(); - MutableArrayRef argsSubrange(argsBegin, - argsBegin + reductionVarTypes.size()); - printClauseWithRegionArgs(p, op, argsSubrange, "reduction", - reductionVarOperands, reductionVarTypes, - reductionVarIsByRef, reductionSymbols); + MutableArrayRef argsSubrange(argsBegin, argsBegin + reductionTypes.size()); + 
printClauseWithRegionArgs(p, op, argsSubrange, "reduction", reductionVars, + reductionTypes, reductionByref, reductionSyms); } - if (privatizerSymbols) { + if (privateSyms) { auto *argsBegin = region.front().getArguments().begin(); - MutableArrayRef argsSubrange(argsBegin + reductionVarOperands.size(), - argsBegin + reductionVarOperands.size() + - privateVarTypes.size()); + MutableArrayRef argsSubrange(argsBegin + reductionVars.size(), + argsBegin + reductionVars.size() + + privateTypes.size()); mlir::SmallVector isByRefVec; - isByRefVec.resize(privateVarTypes.size(), false); + isByRefVec.resize(privateTypes.size(), false); DenseBoolArrayAttr isByRef = makeDenseBoolArrayAttr(op->getContext(), isByRefVec); - printClauseWithRegionArgs(p, op, argsSubrange, "private", - privateVarOperands, privateVarTypes, isByRef, - privatizerSymbols); + printClauseWithRegionArgs(p, op, argsSubrange, "private", privateVars, + privateTypes, isByRef, privateSyms); } p.printRegion(region, /*printEntryBlockArgs=*/false); @@ -599,41 +593,41 @@ static void printParallelRegion(OpAsmPrinter &p, Operation *op, Region ®ion, /// reduction-entry-list ::= reduction-entry /// | reduction-entry-list `,` reduction-entry /// reduction-entry ::= (`byref`)? symbol-ref `->` ssa-id `:` type -static ParseResult -parseReductionVarList(OpAsmParser &parser, - SmallVectorImpl &operands, - SmallVectorImpl &types, DenseBoolArrayAttr &isByRef, - ArrayAttr &reductionSymbols) { +static ParseResult parseReductionVarList( + OpAsmParser &parser, + SmallVectorImpl &reductionVars, + SmallVectorImpl &reductionTypes, DenseBoolArrayAttr &reductionByref, + ArrayAttr &reductionSyms) { SmallVector reductionVec; SmallVector isByRefVec; if (failed(parser.parseCommaSeparatedList([&]() { ParseResult optionalByref = parser.parseOptionalKeyword("byref"); if (parser.parseAttribute(reductionVec.emplace_back()) || parser.parseArrow() || - parser.parseOperand(operands.emplace_back()) || - parser.parseColonType(types.emplace_back())) + parser.parseOperand(reductionVars.emplace_back()) || + parser.parseColonType(reductionTypes.emplace_back())) return failure(); isByRefVec.push_back(optionalByref.succeeded()); return success(); }))) return failure(); - isByRef = makeDenseBoolArrayAttr(parser.getContext(), isByRefVec); + reductionByref = makeDenseBoolArrayAttr(parser.getContext(), isByRefVec); SmallVector reductions(reductionVec.begin(), reductionVec.end()); - reductionSymbols = ArrayAttr::get(parser.getContext(), reductions); + reductionSyms = ArrayAttr::get(parser.getContext(), reductions); return success(); } /// Print Reduction clause -static void printReductionVarList(OpAsmPrinter &p, Operation *op, - OperandRange reductionVars, - TypeRange reductionTypes, - std::optional isByRef, - std::optional reductions) { +static void +printReductionVarList(OpAsmPrinter &p, Operation *op, + OperandRange reductionVars, TypeRange reductionTypes, + std::optional reductionByref, + std::optional reductionSyms) { auto getByRef = [&](unsigned i) -> const char * { - if (!isByRef || !*isByRef) + if (!reductionByref || !*reductionByref) return ""; - assert(isByRef->empty() || i < isByRef->size()); - if (!isByRef->empty() && (*isByRef)[i]) + assert(reductionByref->empty() || i < reductionByref->size()); + if (!reductionByref->empty() && (*reductionByref)[i]) return "byref "; return ""; }; @@ -641,26 +635,26 @@ static void printReductionVarList(OpAsmPrinter &p, Operation *op, for (unsigned i = 0, e = reductionVars.size(); i < e; ++i) { if (i != 0) p << ", "; - p << getByRef(i) << 
(*reductions)[i] << " -> " << reductionVars[i] << " : " - << reductionVars[i].getType(); + p << getByRef(i) << (*reductionSyms)[i] << " -> " << reductionVars[i] + << " : " << reductionVars[i].getType(); } } /// Verifies Reduction Clause static LogicalResult -verifyReductionVarList(Operation *op, std::optional reductions, +verifyReductionVarList(Operation *op, std::optional reductionSyms, OperandRange reductionVars, - std::optional> byRef) { + std::optional> reductionByref) { if (!reductionVars.empty()) { - if (!reductions || reductions->size() != reductionVars.size()) + if (!reductionSyms || reductionSyms->size() != reductionVars.size()) return op->emitOpError() << "expected as many reduction symbol references " "as reduction variables"; - if (byRef && byRef->size() != reductionVars.size()) + if (reductionByref && reductionByref->size() != reductionVars.size()) return op->emitError() << "expected as many reduction variable by " "reference attributes as reduction variables"; } else { - if (reductions) + if (reductionSyms) return op->emitOpError() << "unexpected reduction symbol references"; return success(); } @@ -668,7 +662,7 @@ verifyReductionVarList(Operation *op, std::optional reductions, // TODO: The followings should be done in // SymbolUserOpInterface::verifySymbolUses. DenseSet accumulators; - for (auto args : llvm::zip(reductionVars, *reductions)) { + for (auto args : llvm::zip(reductionVars, *reductionSyms)) { Value accum = std::get<0>(args); if (!accumulators.insert(accum).second) @@ -693,41 +687,40 @@ verifyReductionVarList(Operation *op, std::optional reductions, } //===----------------------------------------------------------------------===// -// Parser, printer and verifier for CopyPrivateVarList +// Parser, printer and verifier for Copyprivate //===----------------------------------------------------------------------===// /// copyprivate-entry-list ::= copyprivate-entry /// | copyprivate-entry-list `,` copyprivate-entry /// copyprivate-entry ::= ssa-id `->` symbol-ref `:` type -static ParseResult parseCopyPrivateVarList( +static ParseResult parseCopyprivate( OpAsmParser &parser, - SmallVectorImpl &operands, - SmallVectorImpl &types, ArrayAttr ©PrivateSymbols) { - SmallVector copyPrivateFuncsVec; + SmallVectorImpl ©privateVars, + SmallVectorImpl ©privateTypes, ArrayAttr ©privateSyms) { + SmallVector symsVec; if (failed(parser.parseCommaSeparatedList([&]() { - if (parser.parseOperand(operands.emplace_back()) || + if (parser.parseOperand(copyprivateVars.emplace_back()) || parser.parseArrow() || - parser.parseAttribute(copyPrivateFuncsVec.emplace_back()) || - parser.parseColonType(types.emplace_back())) + parser.parseAttribute(symsVec.emplace_back()) || + parser.parseColonType(copyprivateTypes.emplace_back())) return failure(); return success(); }))) return failure(); - SmallVector copyPrivateFuncs(copyPrivateFuncsVec.begin(), - copyPrivateFuncsVec.end()); - copyPrivateSymbols = ArrayAttr::get(parser.getContext(), copyPrivateFuncs); + SmallVector syms(symsVec.begin(), symsVec.end()); + copyprivateSyms = ArrayAttr::get(parser.getContext(), syms); return success(); } -/// Print CopyPrivate clause -static void printCopyPrivateVarList(OpAsmPrinter &p, Operation *op, - OperandRange copyPrivateVars, - TypeRange copyPrivateTypes, - std::optional copyPrivateFuncs) { - if (!copyPrivateFuncs.has_value()) +/// Print Copyprivate clause +static void printCopyprivate(OpAsmPrinter &p, Operation *op, + OperandRange copyprivateVars, + TypeRange copyprivateTypes, + std::optional 
copyprivateSyms) { + if (!copyprivateSyms.has_value()) return; llvm::interleaveComma( - llvm::zip(copyPrivateVars, *copyPrivateFuncs, copyPrivateTypes), p, + llvm::zip(copyprivateVars, *copyprivateSyms, copyprivateTypes), p, [&](const auto &args) { p << std::get<0>(args) << " -> " << std::get<1>(args) << " : " << std::get<2>(args); @@ -736,22 +729,22 @@ static void printCopyPrivateVarList(OpAsmPrinter &p, Operation *op, /// Verifies CopyPrivate Clause static LogicalResult -verifyCopyPrivateVarList(Operation *op, OperandRange copyPrivateVars, - std::optional copyPrivateFuncs) { - size_t copyPrivateFuncsSize = - copyPrivateFuncs.has_value() ? copyPrivateFuncs->size() : 0; - if (copyPrivateFuncsSize != copyPrivateVars.size()) - return op->emitOpError() << "inconsistent number of copyPrivate vars (= " - << copyPrivateVars.size() - << ") and functions (= " << copyPrivateFuncsSize +verifyCopyprivateVarList(Operation *op, OperandRange copyprivateVars, + std::optional copyprivateSyms) { + size_t copyprivateSymsSize = + copyprivateSyms.has_value() ? copyprivateSyms->size() : 0; + if (copyprivateSymsSize != copyprivateVars.size()) + return op->emitOpError() << "inconsistent number of copyprivate vars (= " + << copyprivateVars.size() + << ") and functions (= " << copyprivateSymsSize << "), both must be equal"; - if (!copyPrivateFuncs.has_value()) + if (!copyprivateSyms.has_value()) return success(); - for (auto copyPrivateVarAndFunc : - llvm::zip(copyPrivateVars, *copyPrivateFuncs)) { + for (auto copyprivateVarAndSym : + llvm::zip(copyprivateVars, *copyprivateSyms)) { auto symbolRef = - llvm::cast(std::get<1>(copyPrivateVarAndFunc)); + llvm::cast(std::get<1>(copyprivateVarAndSym)); std::optional> funcOp; if (mlir::func::FuncOp mlirFuncOp = @@ -785,7 +778,7 @@ verifyCopyPrivateVarList(Operation *op, OperandRange copyPrivateVars, return op->emitOpError() << "expected copy function " << symbolRef << " arguments to have the same type"; - Type varType = std::get<0>(copyPrivateVarAndFunc).getType(); + Type varType = std::get<0>(copyprivateVarAndSym).getType(); if (argTy != varType) return op->emitOpError() << "expected copy function arguments' type (" << argTy @@ -805,39 +798,39 @@ verifyCopyPrivateVarList(Operation *op, OperandRange copyPrivateVars, /// depend-entry ::= depend-kind `->` ssa-id `:` type static ParseResult parseDependVarList(OpAsmParser &parser, - SmallVectorImpl &operands, - SmallVectorImpl &types, ArrayAttr &dependsArray) { - SmallVector dependVec; + SmallVectorImpl &dependVars, + SmallVectorImpl &dependTypes, ArrayAttr &dependKinds) { + SmallVector kindsVec; if (failed(parser.parseCommaSeparatedList([&]() { StringRef keyword; if (parser.parseKeyword(&keyword) || parser.parseArrow() || - parser.parseOperand(operands.emplace_back()) || - parser.parseColonType(types.emplace_back())) + parser.parseOperand(dependVars.emplace_back()) || + parser.parseColonType(dependTypes.emplace_back())) return failure(); if (std::optional keywordDepend = (symbolizeClauseTaskDepend(keyword))) - dependVec.emplace_back( + kindsVec.emplace_back( ClauseTaskDependAttr::get(parser.getContext(), *keywordDepend)); else return failure(); return success(); }))) return failure(); - SmallVector depends(dependVec.begin(), dependVec.end()); - dependsArray = ArrayAttr::get(parser.getContext(), depends); + SmallVector kinds(kindsVec.begin(), kindsVec.end()); + dependKinds = ArrayAttr::get(parser.getContext(), kinds); return success(); } /// Print Depend clause static void printDependVarList(OpAsmPrinter &p, Operation *op, 
OperandRange dependVars, TypeRange dependTypes, - std::optional depends) { + std::optional dependKinds) { - for (unsigned i = 0, e = depends->size(); i < e; ++i) { + for (unsigned i = 0, e = dependKinds->size(); i < e; ++i) { if (i != 0) p << ", "; p << stringifyClauseTaskDepend( - llvm::cast((*depends)[i]) + llvm::cast((*dependKinds)[i]) .getValue()) << " -> " << dependVars[i] << " : " << dependTypes[i]; } @@ -845,14 +838,14 @@ static void printDependVarList(OpAsmPrinter &p, Operation *op, /// Verifies Depend clause static LogicalResult verifyDependVarList(Operation *op, - std::optional depends, + std::optional dependKinds, OperandRange dependVars) { if (!dependVars.empty()) { - if (!depends || depends->size() != dependVars.size()) + if (!dependKinds || dependKinds->size() != dependVars.size()) return op->emitOpError() << "expected as many depend values" " as depend variables"; } else { - if (depends && !depends->empty()) + if (dependKinds && !dependKinds->empty()) return op->emitOpError() << "unexpected depend values"; return success(); } @@ -1144,8 +1137,8 @@ static void printMembersIndex(OpAsmPrinter &p, MapInfoOp op, static ParseResult parseMapEntries(OpAsmParser &parser, - SmallVectorImpl &mapOperands, - SmallVectorImpl &mapOperandTypes) { + SmallVectorImpl &mapVars, + SmallVectorImpl &mapTypes) { OpAsmParser::UnresolvedOperand arg; OpAsmParser::UnresolvedOperand blockArg; Type argType; @@ -1154,14 +1147,14 @@ parseMapEntries(OpAsmParser &parser, return failure(); if (succeeded(parser.parseOptionalArrow()) && parser.parseOperand(blockArg)) return failure(); - mapOperands.push_back(arg); + mapVars.push_back(arg); return success(); }; auto parseTypes = [&]() -> ParseResult { if (parser.parseType(argType)) return failure(); - mapOperandTypes.push_back(argType); + mapTypes.push_back(argType); return success(); }; @@ -1178,48 +1171,47 @@ parseMapEntries(OpAsmParser &parser, } static void printMapEntries(OpAsmPrinter &p, Operation *op, - OperandRange mapOperands, - TypeRange mapOperandTypes) { + OperandRange mapVars, TypeRange mapTypes) { // Get pointer to the region if this is an omp.target, because printing map // clauses for that operation has to also show the correspondence of each // variable to the corresponding block argument. Block *entryBlock = isa(op) ? 
&op->getRegion(0).front() : nullptr; unsigned argIndex = 0; - for (const auto &mapOp : mapOperands) { + for (const auto &mapOp : mapVars) { p << mapOp; if (entryBlock) { const auto &blockArg = entryBlock->getArgument(argIndex); p << " -> " << blockArg; } argIndex++; - if (argIndex < mapOperands.size()) + if (argIndex < mapVars.size()) p << ", "; } p << " : "; argIndex = 0; - for (const auto &mapType : mapOperandTypes) { + for (const auto &mapType : mapTypes) { p << mapType; argIndex++; - if (argIndex < mapOperands.size()) + if (argIndex < mapVars.size()) p << ", "; } } -static ParseResult parsePrivateList( - OpAsmParser &parser, - SmallVectorImpl &privateOperands, - SmallVectorImpl &privateOperandTypes, ArrayAttr &privatizerSymbols) { +static ParseResult +parsePrivateList(OpAsmParser &parser, + SmallVectorImpl &privateVars, + SmallVectorImpl &privateTypes, ArrayAttr &privateSyms) { SmallVector privateSymRefs; SmallVector regionPrivateArgs; if (failed(parser.parseCommaSeparatedList([&]() { if (parser.parseAttribute(privateSymRefs.emplace_back()) || - parser.parseOperand(privateOperands.emplace_back()) || + parser.parseOperand(privateVars.emplace_back()) || parser.parseArrow() || parser.parseArgument(regionPrivateArgs.emplace_back()) || - parser.parseColonType(privateOperandTypes.emplace_back())) + parser.parseColonType(privateTypes.emplace_back())) return failure(); return success(); }))) @@ -1227,32 +1219,31 @@ static ParseResult parsePrivateList( SmallVector privateSymAttrs(privateSymRefs.begin(), privateSymRefs.end()); - privatizerSymbols = ArrayAttr::get(parser.getContext(), privateSymAttrs); + privateSyms = ArrayAttr::get(parser.getContext(), privateSymAttrs); return success(); } static void printPrivateList(OpAsmPrinter &p, Operation *op, - ValueRange privateVarOperands, - TypeRange privateVarTypes, - ArrayAttr privatizerSymbols) { + ValueRange privateVars, TypeRange privateTypes, + ArrayAttr privateSyms) { // TODO: Remove target-specific logic from this function. 
auto targetOp = mlir::dyn_cast(op); assert(targetOp); auto ®ion = op->getRegion(0); auto *argsBegin = region.front().getArguments().begin(); - MutableArrayRef argsSubrange(argsBegin + targetOp.getMapOperands().size(), - argsBegin + targetOp.getMapOperands().size() + - privateVarTypes.size()); + MutableArrayRef argsSubrange(argsBegin + targetOp.getMapVars().size(), + argsBegin + targetOp.getMapVars().size() + + privateTypes.size()); mlir::SmallVector isByRefVec; - isByRefVec.resize(privateVarTypes.size(), false); + isByRefVec.resize(privateTypes.size(), false); DenseBoolArrayAttr isByRef = DenseBoolArrayAttr::get(op->getContext(), isByRefVec); - printClauseWithRegionArgs( - p, op, argsSubrange, /*clauseName=*/llvm::StringRef{}, privateVarOperands, - privateVarTypes, isByRef, privatizerSymbols); + printClauseWithRegionArgs(p, op, argsSubrange, + /*clauseName=*/llvm::StringRef{}, privateVars, + privateTypes, isByRef, privateSyms); } static void printCaptureType(OpAsmPrinter &p, Operation *op, @@ -1271,32 +1262,32 @@ static void printCaptureType(OpAsmPrinter &p, Operation *op, } static ParseResult parseCaptureType(OpAsmParser &parser, - VariableCaptureKindAttr &mapCapture) { + VariableCaptureKindAttr &mapCaptureType) { StringRef mapCaptureKey; if (parser.parseKeyword(&mapCaptureKey)) return failure(); if (mapCaptureKey == "This") - mapCapture = mlir::omp::VariableCaptureKindAttr::get( + mapCaptureType = mlir::omp::VariableCaptureKindAttr::get( parser.getContext(), mlir::omp::VariableCaptureKind::This); if (mapCaptureKey == "ByRef") - mapCapture = mlir::omp::VariableCaptureKindAttr::get( + mapCaptureType = mlir::omp::VariableCaptureKindAttr::get( parser.getContext(), mlir::omp::VariableCaptureKind::ByRef); if (mapCaptureKey == "ByCopy") - mapCapture = mlir::omp::VariableCaptureKindAttr::get( + mapCaptureType = mlir::omp::VariableCaptureKindAttr::get( parser.getContext(), mlir::omp::VariableCaptureKind::ByCopy); if (mapCaptureKey == "VLAType") - mapCapture = mlir::omp::VariableCaptureKindAttr::get( + mapCaptureType = mlir::omp::VariableCaptureKindAttr::get( parser.getContext(), mlir::omp::VariableCaptureKind::VLAType); return success(); } -static LogicalResult verifyMapClause(Operation *op, OperandRange mapOperands) { +static LogicalResult verifyMapClause(Operation *op, OperandRange mapVars) { llvm::DenseSet> updateToVars; llvm::DenseSet> updateFromVars; - for (auto mapOp : mapOperands) { + for (auto mapOp : mapVars) { if (!mapOp.getDefiningOp()) emitError(op->getLoc(), "missing map operation"); @@ -1378,19 +1369,20 @@ static LogicalResult verifyMapClause(Operation *op, OperandRange mapOperands) { //===----------------------------------------------------------------------===// void TargetDataOp::build(OpBuilder &builder, OperationState &state, - const TargetDataClauseOps &clauses) { - TargetDataOp::build(builder, state, clauses.ifVar, clauses.deviceVar, + const TargetDataOperands &clauses) { + TargetDataOp::build(builder, state, clauses.ifVar, clauses.device, clauses.useDevicePtrVars, clauses.useDeviceAddrVars, clauses.mapVars); } LogicalResult TargetDataOp::verify() { - if (getMapOperands().empty() && getUseDevicePtr().empty() && - getUseDeviceAddr().empty()) { - return ::emitError(this->getLoc(), "At least one of map, useDevicePtr, or " - "useDeviceAddr operand must be present"); + if (getMapVars().empty() && getUseDevicePtrVars().empty() && + getUseDeviceAddrVars().empty()) { + return ::emitError(this->getLoc(), + "At least one of map, use_device_ptr_vars, or " + "use_device_addr_vars operand 
must be present"); } - return verifyMapClause(*this, getMapOperands()); + return verifyMapClause(*this, getMapVars()); } //===----------------------------------------------------------------------===// @@ -1399,40 +1391,37 @@ LogicalResult TargetDataOp::verify() { void TargetEnterDataOp::build( OpBuilder &builder, OperationState &state, - const TargetEnterExitUpdateDataClauseOps &clauses) { + const TargetEnterExitUpdateDataOperands &clauses) { MLIRContext *ctx = builder.getContext(); - TargetEnterDataOp::build(builder, state, clauses.ifVar, clauses.deviceVar, - makeArrayAttr(ctx, clauses.dependTypeAttrs), - clauses.dependVars, clauses.nowaitAttr, - clauses.mapVars); + TargetEnterDataOp::build(builder, state, clauses.ifVar, clauses.device, + makeArrayAttr(ctx, clauses.dependKinds), + clauses.dependVars, clauses.nowait, clauses.mapVars); } LogicalResult TargetEnterDataOp::verify() { LogicalResult verifyDependVars = - verifyDependVarList(*this, getDepends(), getDependVars()); + verifyDependVarList(*this, getDependKinds(), getDependVars()); return failed(verifyDependVars) ? verifyDependVars - : verifyMapClause(*this, getMapOperands()); + : verifyMapClause(*this, getMapVars()); } //===----------------------------------------------------------------------===// // TargetExitDataOp //===----------------------------------------------------------------------===// -void TargetExitDataOp::build( - OpBuilder &builder, OperationState &state, - const TargetEnterExitUpdateDataClauseOps &clauses) { +void TargetExitDataOp::build(OpBuilder &builder, OperationState &state, + const TargetEnterExitUpdateDataOperands &clauses) { MLIRContext *ctx = builder.getContext(); - TargetExitDataOp::build(builder, state, clauses.ifVar, clauses.deviceVar, - makeArrayAttr(ctx, clauses.dependTypeAttrs), - clauses.dependVars, clauses.nowaitAttr, - clauses.mapVars); + TargetExitDataOp::build(builder, state, clauses.ifVar, clauses.device, + makeArrayAttr(ctx, clauses.dependKinds), + clauses.dependVars, clauses.nowait, clauses.mapVars); } LogicalResult TargetExitDataOp::verify() { LogicalResult verifyDependVars = - verifyDependVarList(*this, getDepends(), getDependVars()); + verifyDependVarList(*this, getDependKinds(), getDependVars()); return failed(verifyDependVars) ? verifyDependVars - : verifyMapClause(*this, getMapOperands()); + : verifyMapClause(*this, getMapVars()); } //===----------------------------------------------------------------------===// @@ -1440,19 +1429,18 @@ LogicalResult TargetExitDataOp::verify() { //===----------------------------------------------------------------------===// void TargetUpdateOp::build(OpBuilder &builder, OperationState &state, - const TargetEnterExitUpdateDataClauseOps &clauses) { + const TargetEnterExitUpdateDataOperands &clauses) { MLIRContext *ctx = builder.getContext(); - TargetUpdateOp::build(builder, state, clauses.ifVar, clauses.deviceVar, - makeArrayAttr(ctx, clauses.dependTypeAttrs), - clauses.dependVars, clauses.nowaitAttr, - clauses.mapVars); + TargetUpdateOp::build(builder, state, clauses.ifVar, clauses.device, + makeArrayAttr(ctx, clauses.dependKinds), + clauses.dependVars, clauses.nowait, clauses.mapVars); } LogicalResult TargetUpdateOp::verify() { LogicalResult verifyDependVars = - verifyDependVarList(*this, getDepends(), getDependVars()); + verifyDependVarList(*this, getDependKinds(), getDependVars()); return failed(verifyDependVars) ? 
verifyDependVars - : verifyMapClause(*this, getMapOperands()); + : verifyMapClause(*this, getMapVars()); } //===----------------------------------------------------------------------===// @@ -1460,24 +1448,25 @@ LogicalResult TargetUpdateOp::verify() { //===----------------------------------------------------------------------===// void TargetOp::build(OpBuilder &builder, OperationState &state, - const TargetClauseOps &clauses) { + const TargetOperands &clauses) { MLIRContext *ctx = builder.getContext(); // TODO Store clauses in op: allocateVars, allocatorVars, inReductionVars, - // inReduceVarByRef, inReductionDeclSymbols, reductionVars, reduceVarByRef, - // reductionDeclSymbols. - TargetOp::build( - builder, state, clauses.ifVar, clauses.deviceVar, clauses.threadLimitVar, - makeArrayAttr(ctx, clauses.dependTypeAttrs), clauses.dependVars, - clauses.nowaitAttr, clauses.isDevicePtrVars, clauses.hasDeviceAddrVars, - clauses.mapVars, clauses.privateVars, - makeArrayAttr(ctx, clauses.privatizers)); + // inReductionByref, inReductionSyms. + TargetOp::build(builder, state, clauses.ifVar, clauses.device, + clauses.threadLimit, makeArrayAttr(ctx, clauses.dependKinds), + clauses.dependVars, clauses.nowait, clauses.isDevicePtrVars, + clauses.hasDeviceAddrVars, clauses.mapVars, + clauses.privateVars, makeArrayAttr(ctx, clauses.privateSyms), + /*allocate_vars=*/{}, /*allocator_vars=*/{}, + /*in_reduction_vars=*/{}, /*in_reduction_byref=*/nullptr, + /*in_reduction_syms=*/nullptr); } LogicalResult TargetOp::verify() { LogicalResult verifyDependVars = - verifyDependVarList(*this, getDepends(), getDependVars()); + verifyDependVarList(*this, getDependKinds(), getDependVars()); return failed(verifyDependVars) ? verifyDependVars - : verifyMapClause(*this, getMapOperands()); + : verifyMapClause(*this, getMapVars()); } //===----------------------------------------------------------------------===// @@ -1487,55 +1476,52 @@ LogicalResult TargetOp::verify() { void ParallelOp::build(OpBuilder &builder, OperationState &state, ArrayRef attributes) { ParallelOp::build( - builder, state, /*if_expr=*/nullptr, /*num_threads_var=*/nullptr, - /*allocate_vars=*/ValueRange(), /*allocators_vars=*/ValueRange(), - /*reduction_vars=*/ValueRange(), /*reduction_vars_byref=*/nullptr, - /*reductions=*/nullptr, /*proc_bind_val=*/nullptr, - /*private_vars=*/ValueRange(), /*privatizers=*/nullptr); + builder, state, /*if_expr=*/nullptr, /*num_threads=*/nullptr, + /*allocate_vars=*/ValueRange(), /*allocator_vars=*/ValueRange(), + /*reduction_vars=*/ValueRange(), /*reduction_byref=*/nullptr, + /*reduction_syms=*/nullptr, /*proc_bind_kind=*/nullptr, + /*private_vars=*/ValueRange(), /*private_syms=*/nullptr); state.addAttributes(attributes); } void ParallelOp::build(OpBuilder &builder, OperationState &state, - const ParallelClauseOps &clauses) { + const ParallelOperands &clauses) { MLIRContext *ctx = builder.getContext(); - ParallelOp::build(builder, state, clauses.ifVar, clauses.numThreadsVar, - clauses.allocateVars, clauses.allocatorVars, - clauses.reductionVars, - makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef), - makeArrayAttr(ctx, clauses.reductionDeclSymbols), - clauses.procBindKindAttr, clauses.privateVars, - makeArrayAttr(ctx, clauses.privatizers)); + ParallelOp::build( + builder, state, clauses.ifVar, clauses.numThreads, clauses.allocateVars, + clauses.allocatorVars, clauses.reductionVars, + makeDenseBoolArrayAttr(ctx, clauses.reductionByref), + makeArrayAttr(ctx, clauses.reductionSyms), clauses.procBindKind, + 
clauses.privateVars, makeArrayAttr(ctx, clauses.privateSyms)); } template static LogicalResult verifyPrivateVarList(OpType &op) { auto privateVars = op.getPrivateVars(); - auto privatizers = op.getPrivatizersAttr(); + auto privateSyms = op.getPrivateSymsAttr(); - if (privateVars.empty() && (privatizers == nullptr || privatizers.empty())) + if (privateVars.empty() && (privateSyms == nullptr || privateSyms.empty())) return success(); auto numPrivateVars = privateVars.size(); - auto numPrivatizers = (privatizers == nullptr) ? 0 : privatizers.size(); + auto numPrivateSyms = (privateSyms == nullptr) ? 0 : privateSyms.size(); - if (numPrivateVars != numPrivatizers) + if (numPrivateVars != numPrivateSyms) return op.emitError() << "inconsistent number of private variables and " "privatizer op symbols, private vars: " << numPrivateVars - << " vs. privatizer op symbols: " << numPrivatizers; + << " vs. privatizer op symbols: " << numPrivateSyms; - for (auto privateVarInfo : llvm::zip_equal(privateVars, privatizers)) { + for (auto privateVarInfo : llvm::zip_equal(privateVars, privateSyms)) { Type varType = std::get<0>(privateVarInfo).getType(); - SymbolRefAttr privatizerSym = - cast(std::get<1>(privateVarInfo)); + SymbolRefAttr privateSym = cast(std::get<1>(privateVarInfo)); PrivateClauseOp privatizerOp = - SymbolTable::lookupNearestSymbolFrom(op, - privatizerSym); + SymbolTable::lookupNearestSymbolFrom(op, privateSym); if (privatizerOp == nullptr) return op.emitError() << "failed to lookup privatizer op with symbol: '" - << privatizerSym << "'"; + << privateSym << "'"; Type privatizerType = privatizerOp.getType(); @@ -1570,15 +1556,15 @@ LogicalResult ParallelOp::verify() { } } - if (getAllocateVars().size() != getAllocatorsVars().size()) + if (getAllocateVars().size() != getAllocatorVars().size()) return emitError( "expected equal sizes for allocate and allocator variables"); if (failed(verifyPrivateVarList(*this))) return failure(); - return verifyReductionVarList(*this, getReductions(), getReductionVars(), - getReductionVarsByref()); + return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(), + getReductionByref()); } //===----------------------------------------------------------------------===// @@ -1593,15 +1579,15 @@ static bool opInGlobalImplicitParallelRegion(Operation *op) { } void TeamsOp::build(OpBuilder &builder, OperationState &state, - const TeamsClauseOps &clauses) { + const TeamsOperands &clauses) { MLIRContext *ctx = builder.getContext(); - // TODO Store clauses in op: privateVars, privatizers. - TeamsOp::build(builder, state, clauses.numTeamsLowerVar, - clauses.numTeamsUpperVar, clauses.ifVar, - clauses.threadLimitVar, clauses.allocateVars, + // TODO Store clauses in op: privateVars, privateSyms. 
+ TeamsOp::build(builder, state, clauses.numTeamsLower, clauses.numTeamsUpper, + clauses.ifVar, clauses.threadLimit, clauses.allocateVars, clauses.allocatorVars, clauses.reductionVars, - makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef), - makeArrayAttr(ctx, clauses.reductionDeclSymbols)); + makeDenseBoolArrayAttr(ctx, clauses.reductionByref), + makeArrayAttr(ctx, clauses.reductionSyms), /*private_vars=*/{}, + /*private_syms=*/nullptr); } LogicalResult TeamsOp::verify() { @@ -1628,12 +1614,12 @@ LogicalResult TeamsOp::verify() { } // Check for allocate clause restrictions - if (getAllocateVars().size() != getAllocatorsVars().size()) + if (getAllocateVars().size() != getAllocatorVars().size()) return emitError( "expected equal sizes for allocate and allocator variables"); - return verifyReductionVarList(*this, getReductions(), getReductionVars(), - getReductionVarsByref()); + return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(), + getReductionByref()); } //===----------------------------------------------------------------------===// @@ -1641,23 +1627,23 @@ LogicalResult TeamsOp::verify() { //===----------------------------------------------------------------------===// void SectionsOp::build(OpBuilder &builder, OperationState &state, - const SectionsClauseOps &clauses) { + const SectionsOperands &clauses) { MLIRContext *ctx = builder.getContext(); - // TODO Store clauses in op: privateVars, privatizers. + // TODO Store clauses in op: privateVars, privateSyms. SectionsOp::build(builder, state, clauses.reductionVars, - makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef), - makeArrayAttr(ctx, clauses.reductionDeclSymbols), - clauses.allocateVars, clauses.allocatorVars, - clauses.nowaitAttr); + makeDenseBoolArrayAttr(ctx, clauses.reductionByref), + makeArrayAttr(ctx, clauses.reductionSyms), + clauses.allocateVars, clauses.allocatorVars, clauses.nowait, + /*private_vars=*/{}, /*private_syms=*/nullptr); } LogicalResult SectionsOp::verify() { - if (getAllocateVars().size() != getAllocatorsVars().size()) + if (getAllocateVars().size() != getAllocatorVars().size()) return emitError( "expected equal sizes for allocate and allocator variables"); - return verifyReductionVarList(*this, getReductions(), getReductionVars(), - getReductionVarsByref()); + return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(), + getReductionByref()); } LogicalResult SectionsOp::verifyRegions() { @@ -1676,23 +1662,23 @@ LogicalResult SectionsOp::verifyRegions() { //===----------------------------------------------------------------------===// void SingleOp::build(OpBuilder &builder, OperationState &state, - const SingleClauseOps &clauses) { + const SingleOperands &clauses) { MLIRContext *ctx = builder.getContext(); - // TODO Store clauses in op: privateVars, privatizers. + // TODO Store clauses in op: privateVars, privateSyms. 
SingleOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars, clauses.copyprivateVars, - makeArrayAttr(ctx, clauses.copyprivateFuncs), - clauses.nowaitAttr); + makeArrayAttr(ctx, clauses.copyprivateSyms), clauses.nowait, + /*private_vars=*/{}, /*private_syms=*/nullptr); } LogicalResult SingleOp::verify() { // Check for allocate clause restrictions - if (getAllocateVars().size() != getAllocatorsVars().size()) + if (getAllocateVars().size() != getAllocatorVars().size()) return emitError( "expected equal sizes for allocate and allocator variables"); - return verifyCopyPrivateVarList(*this, getCopyprivateVars(), - getCopyprivateFuncs()); + return verifyCopyprivateVarList(*this, getCopyprivateVars(), + getCopyprivateSyms()); } //===----------------------------------------------------------------------===// @@ -1731,28 +1717,29 @@ void WsloopOp::build(OpBuilder &builder, OperationState &state, ArrayRef attributes) { build(builder, state, /*linear_vars=*/ValueRange(), /*linear_step_vars=*/ValueRange(), /*reduction_vars=*/ValueRange(), - /*reduction_vars_byref=*/nullptr, - /*reductions=*/nullptr, /*schedule_val=*/nullptr, - /*schedule_chunk_var=*/nullptr, /*schedule_modifier=*/nullptr, - /*simd_modifier=*/false, /*nowait=*/false, - /*ordered_val=*/nullptr, /*order_val=*/nullptr, - /*order_modifier=*/nullptr); + /*reduction_byref=*/nullptr, /*reduction_syms=*/nullptr, + /*schedule_kind=*/nullptr, /*schedule_chunk=*/nullptr, + /*schedule_mod=*/nullptr, /*schedule_simd=*/false, /*nowait=*/false, + /*ordered=*/nullptr, /*order=*/nullptr, /*order_mod=*/nullptr, + /*allocate_vars=*/{}, /*allocator_vars=*/{}, /*private_vars=*/{}, + /*private_syms=*/nullptr); state.addAttributes(attributes); } void WsloopOp::build(OpBuilder &builder, OperationState &state, - const WsloopClauseOps &clauses) { + const WsloopOperands &clauses) { MLIRContext *ctx = builder.getContext(); // TODO: Store clauses in op: allocateVars, allocatorVars, privateVars, - // privatizers. + // privateSyms. 
WsloopOp::build(builder, state, clauses.linearVars, clauses.linearStepVars, clauses.reductionVars, - makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef), - makeArrayAttr(ctx, clauses.reductionDeclSymbols), - clauses.scheduleValAttr, clauses.scheduleChunkVar, - clauses.scheduleModAttr, clauses.scheduleSimdAttr, - clauses.nowaitAttr, clauses.orderedAttr, clauses.orderAttr, - clauses.orderModAttr); + makeDenseBoolArrayAttr(ctx, clauses.reductionByref), + makeArrayAttr(ctx, clauses.reductionSyms), + clauses.scheduleKind, clauses.scheduleChunk, + clauses.scheduleMod, clauses.scheduleSimd, clauses.nowait, + clauses.ordered, clauses.order, clauses.orderMod, + /*allocate_vars=*/{}, /*allocator_vars=*/{}, + /*private_vars=*/{}, /*private_syms=*/nullptr); } LogicalResult WsloopOp::verify() { @@ -1766,8 +1753,8 @@ LogicalResult WsloopOp::verify() { return emitError() << "only supported nested wrapper is 'omp.simd'"; } - return verifyReductionVarList(*this, getReductions(), getReductionVars(), - getReductionVarsByref()); + return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(), + getReductionByref()); } //===----------------------------------------------------------------------===// @@ -1775,14 +1762,17 @@ LogicalResult WsloopOp::verify() { //===----------------------------------------------------------------------===// void SimdOp::build(OpBuilder &builder, OperationState &state, - const SimdClauseOps &clauses) { + const SimdOperands &clauses) { MLIRContext *ctx = builder.getContext(); - // TODO Store clauses in op: privateVars, privatizers, reductionVars, - // reduceVarByRef, reductionDeclSymbols. + // TODO Store clauses in op: linearVars, linearStepVars, privateVars, + // privateSyms, reductionVars, reductionByref, reductionSyms. SimdOp::build(builder, state, clauses.alignedVars, - makeArrayAttr(ctx, clauses.alignmentAttrs), clauses.ifVar, - clauses.nontemporalVars, clauses.orderAttr, - clauses.orderModAttr, clauses.safelenAttr, clauses.simdlenAttr); + makeArrayAttr(ctx, clauses.alignments), clauses.ifVar, + /*linear_vars=*/{}, /*linear_step_vars=*/{}, + clauses.nontemporalVars, clauses.order, clauses.orderMod, + /*private_vars=*/{}, /*private_syms=*/nullptr, + /*reduction_vars=*/{}, /*reduction_byref=*/nullptr, + /*reduction_syms=*/nullptr, clauses.safelen, clauses.simdlen); } LogicalResult SimdOp::verify() { @@ -1792,8 +1782,7 @@ LogicalResult SimdOp::verify() { << "simdlen clause and safelen clause are both present, but the " "simdlen value is not less than or equal to safelen value"; - if (verifyAlignedClause(*this, getAlignmentValues(), getAlignedVars()) - .failed()) + if (verifyAlignedClause(*this, getAlignments(), getAlignedVars()).failed()) return failure(); if (verifyNontemporalClause(*this, getNontemporalVars()).failed()) @@ -1813,20 +1802,20 @@ LogicalResult SimdOp::verify() { //===----------------------------------------------------------------------===// void DistributeOp::build(OpBuilder &builder, OperationState &state, - const DistributeClauseOps &clauses) { - // TODO Store clauses in op: privateVars, privatizers. - DistributeOp::build(builder, state, clauses.distScheduleStaticAttr, - clauses.distScheduleChunkSizeVar, clauses.allocateVars, - clauses.allocatorVars, clauses.orderAttr, - clauses.orderModAttr); + const DistributeOperands &clauses) { + // TODO Store clauses in op: privateVars, privateSyms. 
+ DistributeOp::build(builder, state, clauses.distScheduleStatic, + clauses.distScheduleChunkSize, clauses.allocateVars, + clauses.allocatorVars, clauses.order, clauses.orderMod, + /*private_vars=*/{}, /*private_syms=*/nullptr); } LogicalResult DistributeOp::verify() { - if (this->getChunkSize() && !this->getDistScheduleStatic()) + if (this->getDistScheduleChunkSize() && !this->getDistScheduleStatic()) return emitOpError() << "chunk size set without " "dist_schedule_static being present"; - if (getAllocateVars().size() != getAllocatorsVars().size()) + if (getAllocateVars().size() != getAllocatorVars().size()) return emitError( "expected equal sizes for allocate and allocator variables"); @@ -1942,26 +1931,26 @@ LogicalResult DeclareReductionOp::verifyRegions() { //===----------------------------------------------------------------------===// void TaskOp::build(OpBuilder &builder, OperationState &state, - const TaskClauseOps &clauses) { + const TaskOperands &clauses) { MLIRContext *ctx = builder.getContext(); - // TODO Store clauses in op: privateVars, privatizers. - TaskOp::build( - builder, state, clauses.ifVar, clauses.finalVar, clauses.untiedAttr, - clauses.mergeableAttr, clauses.inReductionVars, - makeDenseBoolArrayAttr(ctx, clauses.inReductionVarsByRef), - makeArrayAttr(ctx, clauses.inReductionDeclSymbols), clauses.priorityVar, - makeArrayAttr(ctx, clauses.dependTypeAttrs), clauses.dependVars, - clauses.allocateVars, clauses.allocatorVars); + // TODO Store clauses in op: privateVars, privateSyms. + TaskOp::build(builder, state, clauses.ifVar, clauses.final, clauses.untied, + clauses.mergeable, clauses.inReductionVars, + makeDenseBoolArrayAttr(ctx, clauses.inReductionByref), + makeArrayAttr(ctx, clauses.inReductionSyms), clauses.priority, + makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars, + clauses.allocateVars, clauses.allocatorVars, + /*private_vars=*/{}, /*private_syms=*/nullptr); } LogicalResult TaskOp::verify() { LogicalResult verifyDependVars = - verifyDependVarList(*this, getDepends(), getDependVars()); + verifyDependVarList(*this, getDependKinds(), getDependVars()); return failed(verifyDependVars) ? 
verifyDependVars - : verifyReductionVarList(*this, getInReductions(), + : verifyReductionVarList(*this, getInReductionSyms(), getInReductionVars(), - getInReductionVarsByref()); + getInReductionByref()); } //===----------------------------------------------------------------------===// @@ -1969,19 +1958,18 @@ LogicalResult TaskOp::verify() { //===----------------------------------------------------------------------===// void TaskgroupOp::build(OpBuilder &builder, OperationState &state, - const TaskgroupClauseOps &clauses) { + const TaskgroupOperands &clauses) { MLIRContext *ctx = builder.getContext(); - TaskgroupOp::build( - builder, state, clauses.taskReductionVars, - makeDenseBoolArrayAttr(ctx, clauses.taskReductionVarsByRef), - makeArrayAttr(ctx, clauses.taskReductionDeclSymbols), - clauses.allocateVars, clauses.allocatorVars); + TaskgroupOp::build(builder, state, clauses.taskReductionVars, + makeDenseBoolArrayAttr(ctx, clauses.taskReductionByref), + makeArrayAttr(ctx, clauses.taskReductionSyms), + clauses.allocateVars, clauses.allocatorVars); } LogicalResult TaskgroupOp::verify() { - return verifyReductionVarList(*this, getTaskReductions(), + return verifyReductionVarList(*this, getTaskReductionSyms(), getTaskReductionVars(), - getTaskReductionVarsByref()); + getTaskReductionByref()); } //===----------------------------------------------------------------------===// @@ -1989,18 +1977,19 @@ LogicalResult TaskgroupOp::verify() { //===----------------------------------------------------------------------===// void TaskloopOp::build(OpBuilder &builder, OperationState &state, - const TaskloopClauseOps &clauses) { + const TaskloopOperands &clauses) { MLIRContext *ctx = builder.getContext(); - // TODO Store clauses in op: privateVars, privatizers. - TaskloopOp::build( - builder, state, clauses.ifVar, clauses.finalVar, clauses.untiedAttr, - clauses.mergeableAttr, clauses.inReductionVars, - makeDenseBoolArrayAttr(ctx, clauses.inReductionVarsByRef), - makeArrayAttr(ctx, clauses.inReductionDeclSymbols), clauses.reductionVars, - makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef), - makeArrayAttr(ctx, clauses.reductionDeclSymbols), clauses.priorityVar, - clauses.allocateVars, clauses.allocatorVars, clauses.grainsizeVar, - clauses.numTasksVar, clauses.nogroupAttr); + // TODO Store clauses in op: privateVars, privateSyms. 
+ TaskloopOp::build(builder, state, clauses.ifVar, clauses.final, + clauses.untied, clauses.mergeable, clauses.inReductionVars, + makeDenseBoolArrayAttr(ctx, clauses.inReductionByref), + makeArrayAttr(ctx, clauses.inReductionSyms), + clauses.reductionVars, + makeDenseBoolArrayAttr(ctx, clauses.reductionByref), + makeArrayAttr(ctx, clauses.reductionSyms), clauses.priority, + clauses.allocateVars, clauses.allocatorVars, + clauses.grainsize, clauses.numTasks, clauses.nogroup, + /*private_vars=*/{}, /*private_syms=*/nullptr); } SmallVector TaskloopOp::getAllReductionVars() { @@ -2012,14 +2001,14 @@ SmallVector TaskloopOp::getAllReductionVars() { } LogicalResult TaskloopOp::verify() { - if (getAllocateVars().size() != getAllocatorsVars().size()) + if (getAllocateVars().size() != getAllocatorVars().size()) return emitError( "expected equal sizes for allocate and allocator variables"); - if (failed(verifyReductionVarList(*this, getReductions(), getReductionVars(), - getReductionVarsByref())) || - failed(verifyReductionVarList(*this, getInReductions(), + if (failed(verifyReductionVarList(*this, getReductionSyms(), + getReductionVars(), getReductionByref())) || + failed(verifyReductionVarList(*this, getInReductionSyms(), getInReductionVars(), - getInReductionVarsByref()))) + getInReductionByref()))) return failure(); if (!getReductionVars().empty() && getNogroup()) @@ -2031,7 +2020,7 @@ LogicalResult TaskloopOp::verify() { "and an in_reduction clause"); } - if (getGrainSize() && getNumTasks()) { + if (getGrainsize() && getNumTasks()) { return emitError( "the grainsize clause and num_tasks clause are mutually exclusive and " "may not appear on the same taskloop directive"); @@ -2072,7 +2061,7 @@ ParseResult LoopNestOp::parse(OpAsmParser &parser, OperationState &result) { // Parse "inclusive" flag. if (succeeded(parser.parseOptionalKeyword("inclusive"))) - result.addAttribute("inclusive", + result.addAttribute("loop_inclusive", UnitAttr::get(parser.getBuilder().getContext())); // Parse step values. 
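For reference, the `omp.loop_nest` syntax handled by these parse/print hunks looks roughly as follows; the `inclusive` keyword is only emitted when the renamed `loop_inclusive` attribute is set. This is an illustrative sketch (the SSA names and the i32 type are made up, not taken from the patch):

  omp.loop_nest (%iv) : i32 = (%lb) to (%ub) inclusive step (%step) {
    omp.yield
  }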
@@ -2099,28 +2088,29 @@ ParseResult LoopNestOp::parse(OpAsmParser &parser, OperationState &result) { void LoopNestOp::print(OpAsmPrinter &p) { Region ®ion = getRegion(); auto args = region.getArguments(); - p << " (" << args << ") : " << args[0].getType() << " = (" << getLowerBound() - << ") to (" << getUpperBound() << ") "; - if (getInclusive()) + p << " (" << args << ") : " << args[0].getType() << " = (" + << getLoopLowerBounds() << ") to (" << getLoopUpperBounds() << ") "; + if (getLoopInclusive()) p << "inclusive "; - p << "step (" << getStep() << ") "; + p << "step (" << getLoopSteps() << ") "; p.printRegion(region, /*printEntryBlockArgs=*/false); } void LoopNestOp::build(OpBuilder &builder, OperationState &state, - const LoopNestClauseOps &clauses) { - LoopNestOp::build(builder, state, clauses.loopLBVar, clauses.loopUBVar, - clauses.loopStepVar, clauses.loopInclusiveAttr); + const LoopNestOperands &clauses) { + LoopNestOp::build(builder, state, clauses.loopLowerBounds, + clauses.loopUpperBounds, clauses.loopSteps, + clauses.loopInclusive); } LogicalResult LoopNestOp::verify() { - if (getLowerBound().empty()) + if (getLoopLowerBounds().empty()) return emitOpError() << "must represent at least one loop"; - if (getLowerBound().size() != getIVs().size()) + if (getLoopLowerBounds().size() != getIVs().size()) return emitOpError() << "number of range arguments and IVs do not match"; - for (auto [lb, iv] : llvm::zip_equal(getLowerBound(), getIVs())) { + for (auto [lb, iv] : llvm::zip_equal(getLoopLowerBounds(), getIVs())) { if (lb.getType() != iv.getType()) return emitOpError() << "range argument type does not match corresponding IV type"; @@ -2152,13 +2142,12 @@ void LoopNestOp::gatherWrappers( //===----------------------------------------------------------------------===// void CriticalDeclareOp::build(OpBuilder &builder, OperationState &state, - const CriticalClauseOps &clauses) { - CriticalDeclareOp::build(builder, state, clauses.criticalNameAttr, - clauses.hintAttr); + const CriticalDeclareOperands &clauses) { + CriticalDeclareOp::build(builder, state, clauses.symName, clauses.hint); } LogicalResult CriticalDeclareOp::verify() { - return verifySynchronizationHint(*this, getHintVal()); + return verifySynchronizationHint(*this, getHint()); } LogicalResult CriticalOp::verifySymbolUses(SymbolTableCollection &symbolTable) { @@ -2193,7 +2182,7 @@ static LogicalResult verifyOrderedParent(Operation &op) { Operation *wrapper = loopOp->getParentOp(); if (auto wsloopOp = dyn_cast(wrapper)) { - IntegerAttr orderedAttr = wsloopOp.getOrderedValAttr(); + IntegerAttr orderedAttr = wsloopOp.getOrderedAttr(); if (!orderedAttr) return op.emitOpError() << "the enclosing worksharing-loop region must " "have an ordered clause"; @@ -2213,9 +2202,9 @@ static LogicalResult verifyOrderedParent(Operation &op) { } void OrderedOp::build(OpBuilder &builder, OperationState &state, - const OrderedOpClauseOps &clauses) { - OrderedOp::build(builder, state, clauses.doacrossDependTypeAttr, - clauses.doacrossNumLoopsAttr, clauses.doacrossVectorVars); + const OrderedOperands &clauses) { + OrderedOp::build(builder, state, clauses.doacrossDependType, + clauses.doacrossNumLoops, clauses.doacrossDependVars); } LogicalResult OrderedOp::verify() { @@ -2223,7 +2212,7 @@ LogicalResult OrderedOp::verify() { return failure(); auto wrapper = (*this)->getParentOfType(); - if (!wrapper || *wrapper.getOrderedVal() != *getNumLoopsVal()) + if (!wrapper || *wrapper.getOrdered() != *getDoacrossNumLoops()) return emitOpError() << "number of 
variables in depend clause does not " << "match number of iteration variables in the " << "doacross loop"; @@ -2232,13 +2221,13 @@ LogicalResult OrderedOp::verify() { } void OrderedRegionOp::build(OpBuilder &builder, OperationState &state, - const OrderedRegionClauseOps &clauses) { - OrderedRegionOp::build(builder, state, clauses.parLevelSimdAttr); + const OrderedRegionOperands &clauses) { + OrderedRegionOp::build(builder, state, clauses.parLevelSimd); } LogicalResult OrderedRegionOp::verify() { // TODO: The code generation for ordered simd directive is not supported yet. - if (getSimd()) + if (getParLevelSimd()) return failure(); return verifyOrderedParent(**this); @@ -2249,9 +2238,10 @@ LogicalResult OrderedRegionOp::verify() { //===----------------------------------------------------------------------===// void TaskwaitOp::build(OpBuilder &builder, OperationState &state, - const TaskwaitClauseOps &clauses) { - // TODO Store clauses in op: dependTypeAttrs, dependVars, nowaitAttr. - TaskwaitOp::build(builder, state); + const TaskwaitOperands &clauses) { + // TODO Store clauses in op: dependKinds, dependVars, nowait. + TaskwaitOp::build(builder, state, /*depend_kinds=*/nullptr, + /*depend_vars=*/{}, /*nowait=*/nullptr); } //===----------------------------------------------------------------------===// @@ -2262,14 +2252,14 @@ LogicalResult AtomicReadOp::verify() { if (verifyCommon().failed()) return mlir::failure(); - if (auto mo = getMemoryOrderVal()) { + if (auto mo = getMemoryOrder()) { if (*mo == ClauseMemoryOrderKind::Acq_rel || *mo == ClauseMemoryOrderKind::Release) { return emitError( "memory-order must not be acq_rel or release for atomic reads"); } } - return verifySynchronizationHint(*this, getHintVal()); + return verifySynchronizationHint(*this, getHint()); } //===----------------------------------------------------------------------===// @@ -2280,14 +2270,14 @@ LogicalResult AtomicWriteOp::verify() { if (verifyCommon().failed()) return mlir::failure(); - if (auto mo = getMemoryOrderVal()) { + if (auto mo = getMemoryOrder()) { if (*mo == ClauseMemoryOrderKind::Acq_rel || *mo == ClauseMemoryOrderKind::Acquire) { return emitError( "memory-order must not be acq_rel or acquire for atomic writes"); } } - return verifySynchronizationHint(*this, getHintVal()); + return verifySynchronizationHint(*this, getHint()); } //===----------------------------------------------------------------------===// @@ -2301,9 +2291,8 @@ LogicalResult AtomicUpdateOp::canonicalize(AtomicUpdateOp op, return success(); } if (Value writeVal = op.getWriteOpVal()) { - rewriter.replaceOpWithNewOp(op, op.getX(), writeVal, - op.getHintValAttr(), - op.getMemoryOrderValAttr()); + rewriter.replaceOpWithNewOp( + op, op.getX(), writeVal, op.getHintAttr(), op.getMemoryOrderAttr()); return success(); } return failure(); @@ -2313,7 +2302,7 @@ LogicalResult AtomicUpdateOp::verify() { if (verifyCommon().failed()) return mlir::failure(); - if (auto mo = getMemoryOrderVal()) { + if (auto mo = getMemoryOrder()) { if (*mo == ClauseMemoryOrderKind::Acq_rel || *mo == ClauseMemoryOrderKind::Acquire) { return emitError( @@ -2321,7 +2310,7 @@ LogicalResult AtomicUpdateOp::verify() { } } - return verifySynchronizationHint(*this, getHintVal()); + return verifySynchronizationHint(*this, getHint()); } LogicalResult AtomicUpdateOp::verifyRegions() { return verifyRegionsCommon(); } @@ -2349,19 +2338,19 @@ AtomicUpdateOp AtomicCaptureOp::getAtomicUpdateOp() { } LogicalResult AtomicCaptureOp::verify() { - return 
verifySynchronizationHint(*this, getHintVal()); + return verifySynchronizationHint(*this, getHint()); } LogicalResult AtomicCaptureOp::verifyRegions() { if (verifyRegionsCommon().failed()) return mlir::failure(); - if (getFirstOp()->getAttr("hint_val") || getSecondOp()->getAttr("hint_val")) + if (getFirstOp()->getAttr("hint") || getSecondOp()->getAttr("hint")) return emitOpError( "operations inside capture region must not have hint clause"); - if (getFirstOp()->getAttr("memory_order_val") || - getSecondOp()->getAttr("memory_order_val")) + if (getFirstOp()->getAttr("memory_order") || + getSecondOp()->getAttr("memory_order")) return emitOpError( "operations inside capture region must not have memory_order clause"); return success(); @@ -2372,13 +2361,12 @@ LogicalResult AtomicCaptureOp::verifyRegions() { //===----------------------------------------------------------------------===// void CancelOp::build(OpBuilder &builder, OperationState &state, - const CancelClauseOps &clauses) { - CancelOp::build(builder, state, clauses.cancelDirectiveNameAttr, - clauses.ifVar); + const CancelOperands &clauses) { + CancelOp::build(builder, state, clauses.cancelDirective, clauses.ifVar); } LogicalResult CancelOp::verify() { - ClauseCancellationConstructType cct = getCancellationConstructTypeVal(); + ClauseCancellationConstructType cct = getCancelDirective(); Operation *parentOp = (*this)->getParentOp(); if (!parentOp) { @@ -2404,7 +2392,7 @@ LogicalResult CancelOp::verify() { return emitError() << "A worksharing construct that is canceled " << "must not have a nowait clause"; } - if (wsloopOp.getOrderedValAttr()) { + if (wsloopOp.getOrderedAttr()) { return emitError() << "A worksharing construct that is canceled " << "must not have an ordered clause"; } @@ -2429,12 +2417,12 @@ LogicalResult CancelOp::verify() { //===----------------------------------------------------------------------===// void CancellationPointOp::build(OpBuilder &builder, OperationState &state, - const CancellationPointClauseOps &clauses) { - CancellationPointOp::build(builder, state, clauses.cancelDirectiveNameAttr); + const CancellationPointOperands &clauses) { + CancellationPointOp::build(builder, state, clauses.cancelDirective); } LogicalResult CancellationPointOp::verify() { - ClauseCancellationConstructType cct = getCancellationConstructTypeVal(); + ClauseCancellationConstructType cct = getCancelDirective(); Operation *parentOp = (*this)->getParentOp(); if (!parentOp) { @@ -2574,8 +2562,8 @@ LogicalResult PrivateClauseOp::verify() { //===----------------------------------------------------------------------===// void MaskedOp::build(OpBuilder &builder, OperationState &state, - const MaskedClauseOps &clauses) { - MaskedOp::build(builder, state, clauses.filteredThreadIdVar); + const MaskedOperands &clauses) { + MaskedOp::build(builder, state, clauses.filteredThreadId); } #define GET_ATTRDEF_CLASSES diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 8b031deca89314..ddee1178386979 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -362,9 +362,9 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder, auto criticalDeclareOp = SymbolTable::lookupNearestSymbolFrom(criticalOp, symbolRef); - hint = llvm::ConstantInt::get( - llvm::Type::getInt32Ty(llvmContext), - static_cast(criticalDeclareOp.getHintVal())); + hint = 
+ llvm::ConstantInt::get(llvm::Type::getInt32Ty(llvmContext), + static_cast(criticalDeclareOp.getHint())); } builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createCritical( ompLoc, bodyGenCB, finiCB, criticalOp.getName().value_or(""), hint)); @@ -376,7 +376,7 @@ template static void collectReductionDecls(T loop, SmallVectorImpl &reductions) { - std::optional attr = loop.getReductions(); + std::optional attr = loop.getReductionSyms(); if (!attr) return; @@ -534,11 +534,11 @@ convertOmpOrdered(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { auto orderedOp = cast(opInst); - omp::ClauseDepend dependType = *orderedOp.getDependTypeVal(); + omp::ClauseDepend dependType = *orderedOp.getDoacrossDependType(); bool isDependSource = dependType == omp::ClauseDepend::dependsource; - unsigned numLoops = *orderedOp.getNumLoopsVal(); + unsigned numLoops = *orderedOp.getDoacrossNumLoops(); SmallVector vecValues = - moduleTranslation.lookupValues(orderedOp.getDependVecVars()); + moduleTranslation.lookupValues(orderedOp.getDoacrossDependVars()); size_t indexVecValues = 0; while (indexVecValues < vecValues.size()) { @@ -566,7 +566,7 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder, auto orderedRegionOp = cast(opInst); // TODO: The code generation for ordered simd directive is not supported yet. - if (orderedRegionOp.getSimd()) + if (orderedRegionOp.getParLevelSimd()) return failure(); // TODO: support error propagation in OpenMPIRBuilder and use it instead of @@ -588,7 +588,7 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); builder.restoreIP( moduleTranslation.getOpenMPBuilder()->createOrderedThreadsSimd( - ompLoc, bodyGenCB, finiCB, !orderedRegionOp.getSimd())); + ompLoc, bodyGenCB, finiCB, !orderedRegionOp.getParLevelSimd())); return bodyGenStatus; } @@ -837,11 +837,11 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder, // TODO: Support the following clauses: private, firstprivate, lastprivate, // allocate if (!sectionsOp.getAllocateVars().empty() || - !sectionsOp.getAllocatorsVars().empty()) - return emitError(sectionsOp.getLoc()) - << "allocate clause is not supported for sections construct"; + !sectionsOp.getAllocatorVars().empty() || + !sectionsOp.getPrivateVars().empty() || sectionsOp.getPrivateSyms()) + return opInst.emitError("unhandled clauses for translation to LLVM IR"); - llvm::ArrayRef isByRef = getIsByRef(sectionsOp.getReductionVarsByref()); + llvm::ArrayRef isByRef = getIsByRef(sectionsOp.getReductionByref()); assert(isByRef.size() == sectionsOp.getNumReductionVars()); SmallVector reductionDecls; @@ -945,6 +945,9 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder, using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); LogicalResult bodyGenStatus = success(); + if (!singleOp.getPrivateVars().empty() || singleOp.getPrivateSyms()) + return singleOp.emitError("unhandled clauses for translation to LLVM IR"); + auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { builder.restoreIP(codegenIP); convertOmpOpRegions(singleOp.getRegion(), "omp.single.region", builder, @@ -954,7 +957,7 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder, // Handle copyprivate Operation::operand_range cpVars = singleOp.getCopyprivateVars(); - std::optional cpFuncs = singleOp.getCopyprivateFuncs(); + std::optional 
cpFuncs = singleOp.getCopyprivateSyms(); llvm::SmallVector llvmCPVars; llvm::SmallVector llvmCPFuncs; for (size_t i = 0, e = cpVars.size(); i < e; ++i) { @@ -976,7 +979,8 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; LogicalResult bodyGenStatus = success(); - if (!op.getAllocatorsVars().empty() || op.getReductions()) + if (!op.getAllocatorVars().empty() || op.getReductionSyms() || + !op.getPrivateVars().empty() || op.getPrivateSyms()) return op.emitError("unhandled clauses for translation to LLVM IR"); auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { @@ -1000,8 +1004,8 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder, threadLimit = moduleTranslation.lookupValue(threadLimitVar); llvm::Value *ifExpr = nullptr; - if (Value ifExprVar = op.getIfExpr()) - ifExpr = moduleTranslation.lookupValue(ifExprVar); + if (Value ifVar = op.getIfExpr()) + ifExpr = moduleTranslation.lookupValue(ifVar); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTeams( @@ -1010,12 +1014,12 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder, } static void -buildDependData(std::optional depends, OperandRange dependVars, +buildDependData(std::optional dependKinds, OperandRange dependVars, LLVM::ModuleTranslation &moduleTranslation, SmallVectorImpl &dds) { if (dependVars.empty()) return; - for (auto dep : llvm::zip(dependVars, depends->getValue())) { + for (auto dep : llvm::zip(dependVars, dependKinds->getValue())) { llvm::omp::RTLDependenceKindTy type; switch ( cast(std::get<1>(dep)).getValue()) { @@ -1042,8 +1046,9 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; LogicalResult bodyGenStatus = success(); if (taskOp.getUntiedAttr() || taskOp.getMergeableAttr() || - taskOp.getInReductions() || taskOp.getPriority() || - !taskOp.getAllocateVars().empty()) { + taskOp.getInReductionSyms() || taskOp.getPriority() || + !taskOp.getAllocateVars().empty() || !taskOp.getPrivateVars().empty() || + taskOp.getPrivateSyms()) { return taskOp.emitError("unhandled clauses for translation to LLVM IR"); } auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { @@ -1058,7 +1063,7 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, }; SmallVector dds; - buildDependData(taskOp.getDepends(), taskOp.getDependVars(), + buildDependData(taskOp.getDependKinds(), taskOp.getDependVars(), moduleTranslation, dds); llvm::OpenMPIRBuilder::InsertPointTy allocaIP = @@ -1066,7 +1071,7 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTask( ompLoc, allocaIP, bodyCB, !taskOp.getUntied(), - moduleTranslation.lookupValue(taskOp.getFinalExpr()), + moduleTranslation.lookupValue(taskOp.getFinal()), moduleTranslation.lookupValue(taskOp.getIfExpr()), dds)); return bodyGenStatus; } @@ -1091,30 +1096,47 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder, ompLoc, allocaIP, bodyCB)); return bodyGenStatus; } + +static LogicalResult +convertOmpTaskwaitOp(omp::TaskwaitOp twOp, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + if (!twOp.getDependVars().empty() || twOp.getDependKinds() || + twOp.getNowait()) + return 
twOp.emitError("unhandled clauses for translation to LLVM IR"); + + moduleTranslation.getOpenMPBuilder()->createTaskwait(builder.saveIP()); + return success(); +} + /// Converts an OpenMP workshare loop into LLVM IR using OpenMPIRBuilder. static LogicalResult convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { auto wsloopOp = cast(opInst); + if (!wsloopOp.getAllocateVars().empty() || + !wsloopOp.getAllocatorVars().empty() || + !wsloopOp.getPrivateVars().empty() || wsloopOp.getPrivateSyms()) + return opInst.emitError("unhandled clauses for translation to LLVM IR"); + // FIXME: Here any other nested wrappers (e.g. omp.simd) are skipped, so // codegen for composite constructs like 'DO/FOR SIMD' will be the same as for // 'DO/FOR'. auto loopOp = cast(wsloopOp.getWrappedLoop()); - llvm::ArrayRef isByRef = getIsByRef(wsloopOp.getReductionVarsByref()); + llvm::ArrayRef isByRef = getIsByRef(wsloopOp.getReductionByref()); assert(isByRef.size() == wsloopOp.getNumReductionVars()); // Static is the default. auto schedule = - wsloopOp.getScheduleVal().value_or(omp::ClauseScheduleKind::Static); + wsloopOp.getScheduleKind().value_or(omp::ClauseScheduleKind::Static); // Find the loop configuration. - llvm::Value *step = moduleTranslation.lookupValue(loopOp.getStep()[0]); + llvm::Value *step = moduleTranslation.lookupValue(loopOp.getLoopSteps()[0]); llvm::Type *ivType = step->getType(); llvm::Value *chunk = nullptr; - if (wsloopOp.getScheduleChunkVar()) { + if (wsloopOp.getScheduleChunk()) { llvm::Value *chunkVar = - moduleTranslation.lookupValue(wsloopOp.getScheduleChunkVar()); + moduleTranslation.lookupValue(wsloopOp.getScheduleChunk()); chunk = builder.CreateSExtOrTrunc(chunkVar, ivType); } @@ -1178,10 +1200,10 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); for (unsigned i = 0, e = loopOp.getNumLoops(); i < e; ++i) { llvm::Value *lowerBound = - moduleTranslation.lookupValue(loopOp.getLowerBound()[i]); + moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[i]); llvm::Value *upperBound = - moduleTranslation.lookupValue(loopOp.getUpperBound()[i]); - llvm::Value *step = moduleTranslation.lookupValue(loopOp.getStep()[i]); + moduleTranslation.lookupValue(loopOp.getLoopUpperBounds()[i]); + llvm::Value *step = moduleTranslation.lookupValue(loopOp.getLoopSteps()[i]); // Make sure loop trip count are emitted in the preheader of the outermost // loop at the latest so that they are all available for the new collapsed @@ -1194,7 +1216,7 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, } loopInfos.push_back(ompBuilder->createCanonicalLoop( loc, bodyGen, lowerBound, upperBound, step, - /*IsSigned=*/true, loopOp.getInclusive(), computeIP)); + /*IsSigned=*/true, loopOp.getLoopInclusive(), computeIP)); if (failed(bodyGenStatus)) return failure(); @@ -1209,16 +1231,15 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, allocaIP = findAllocaInsertPoint(builder, moduleTranslation); // TODO: Handle doacross loops when the ordered clause has a parameter. 
- bool isOrdered = wsloopOp.getOrderedVal().has_value(); - std::optional scheduleModifier = - wsloopOp.getScheduleModifier(); - bool isSimd = wsloopOp.getSimdModifier(); + bool isOrdered = wsloopOp.getOrdered().has_value(); + std::optional scheduleMod = wsloopOp.getScheduleMod(); + bool isSimd = wsloopOp.getScheduleSimd(); ompBuilder->applyWorkshareLoop( ompLoc.DL, loopInfo, allocaIP, !wsloopOp.getNowait(), convertToScheduleKind(schedule), chunk, isSimd, - scheduleModifier == omp::ScheduleModifier::monotonic, - scheduleModifier == omp::ScheduleModifier::nonmonotonic, isOrdered); + scheduleMod == omp::ScheduleModifier::monotonic, + scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered); // Continue building IR after the loop. Note that the LoopInfo returned by // `collapseLoops` points inside the outermost loop and is intended for @@ -1275,7 +1296,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; OmpParallelOpConversionManager raii(opInst); - ArrayRef isByRef = getIsByRef(opInst.getReductionVarsByref()); + ArrayRef isByRef = getIsByRef(opInst.getReductionByref()); assert(isByRef.size() == opInst.getNumReductionVars()); // TODO: support error propagation in OpenMPIRBuilder and use it instead of @@ -1420,11 +1441,11 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, auto [privVar, privatizerClone] = [&]() -> std::pair { if (!opInst.getPrivateVars().empty()) { - auto privVars = opInst.getPrivateVars(); - auto privatizers = opInst.getPrivatizers(); + auto privateVars = opInst.getPrivateVars(); + auto privateSyms = opInst.getPrivateSyms(); for (auto [privVar, privatizerAttr] : - llvm::zip_equal(privVars, *privatizers)) { + llvm::zip_equal(privateVars, *privateSyms)) { // Find the MLIR private variable corresponding to the LLVM value // being privatized. llvm::Value *llvmPrivVar = moduleTranslation.lookupValue(privVar); @@ -1564,13 +1585,13 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, }; llvm::Value *ifCond = nullptr; - if (auto ifExprVar = opInst.getIfExpr()) - ifCond = moduleTranslation.lookupValue(ifExprVar); + if (auto ifVar = opInst.getIfExpr()) + ifCond = moduleTranslation.lookupValue(ifVar); llvm::Value *numThreads = nullptr; - if (auto numThreadsVar = opInst.getNumThreadsVar()) + if (auto numThreadsVar = opInst.getNumThreads()) numThreads = moduleTranslation.lookupValue(numThreadsVar); auto pbKind = llvm::omp::OMP_PROC_BIND_default; - if (auto bind = opInst.getProcBindVal()) + if (auto bind = opInst.getProcBindKind()) pbKind = getProcBindKind(*bind); // TODO: Is the Parallel construct cancellable? bool isCancellable = false; @@ -1608,6 +1629,12 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, auto simdOp = cast(opInst); auto loopOp = cast(simdOp.getWrappedLoop()); + if (!simdOp.getLinearVars().empty() || !simdOp.getLinearStepVars().empty() || + !simdOp.getPrivateVars().empty() || simdOp.getPrivateSyms() || + !simdOp.getReductionVars().empty() || simdOp.getReductionByref() || + simdOp.getReductionSyms()) + return opInst.emitError("unhandled clauses for translation to LLVM IR"); + llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); // Generator of the canonical loop body. 
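A minimal sketch (value names assumed) of a simd construct that this updated convertOmpSimd path still accepts, i.e. one carrying none of the newly rejected linear, private or reduction clauses:

    omp.simd {
      omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
        omp.yield
      }
      omp.terminator
    }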
@@ -1643,10 +1670,10 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); for (unsigned i = 0, e = loopOp.getNumLoops(); i < e; ++i) { llvm::Value *lowerBound = - moduleTranslation.lookupValue(loopOp.getLowerBound()[i]); + moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[i]); llvm::Value *upperBound = - moduleTranslation.lookupValue(loopOp.getUpperBound()[i]); - llvm::Value *step = moduleTranslation.lookupValue(loopOp.getStep()[i]); + moduleTranslation.lookupValue(loopOp.getLoopUpperBounds()[i]); + llvm::Value *step = moduleTranslation.lookupValue(loopOp.getLoopSteps()[i]); // Make sure loop trip count are emitted in the preheader of the outermost // loop at the latest so that they are all available for the new collapsed @@ -1680,7 +1707,7 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, safelen = builder.getInt64(safelenVar.value()); llvm::MapVector alignedVars; - llvm::omp::OrderKind order = convertOrderKind(simdOp.getOrderVal()); + llvm::omp::OrderKind order = convertOrderKind(simdOp.getOrder()); ompBuilder->applySimd(loopInfo, alignedVars, simdOp.getIfExpr() ? moduleTranslation.lookupValue(simdOp.getIfExpr()) @@ -1722,7 +1749,7 @@ convertOmpAtomicRead(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); - llvm::AtomicOrdering AO = convertAtomicOrdering(readOp.getMemoryOrderVal()); + llvm::AtomicOrdering AO = convertAtomicOrdering(readOp.getMemoryOrder()); llvm::Value *x = moduleTranslation.lookupValue(readOp.getX()); llvm::Value *v = moduleTranslation.lookupValue(readOp.getV()); @@ -1743,7 +1770,7 @@ convertOmpAtomicWrite(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); - llvm::AtomicOrdering ao = convertAtomicOrdering(writeOp.getMemoryOrderVal()); + llvm::AtomicOrdering ao = convertAtomicOrdering(writeOp.getMemoryOrder()); llvm::Value *expr = moduleTranslation.lookupValue(writeOp.getExpr()); llvm::Value *dest = moduleTranslation.lookupValue(writeOp.getX()); llvm::Type *ty = moduleTranslation.convertType(writeOp.getExpr().getType()); @@ -1811,7 +1838,7 @@ convertOmpAtomicUpdate(omp::AtomicUpdateOp &opInst, /*isVolatile=*/false}; llvm::AtomicOrdering atomicOrdering = - convertAtomicOrdering(opInst.getMemoryOrderVal()); + convertAtomicOrdering(opInst.getMemoryOrder()); // Generate update code. 
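For reference, a minimal omp.atomic.update region of the shape this lowering handles. Illustrative only: %x and %expr are assumed SSA values in scope, and relaxed is chosen because the verifier earlier in this patch rejects only acquire and acq_rel for updates:

    omp.atomic.update memory_order(relaxed) %x : !llvm.ptr {
    ^bb0(%xval: i32):
      %newval = llvm.add %xval, %expr : i32
      omp.yield(%newval : i32)
    }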
LogicalResult updateGenStatus = success(); @@ -1903,7 +1930,7 @@ convertOmpAtomicCapture(omp::AtomicCaptureOp atomicCaptureOp, /*isVolatile=*/false}; llvm::AtomicOrdering atomicOrdering = - convertAtomicOrdering(atomicCaptureOp.getMemoryOrderVal()); + convertAtomicOrdering(atomicCaptureOp.getMemoryOrder()); LogicalResult updateGenStatus = success(); auto updateFn = [&](llvm::Value *atomicx, @@ -2166,12 +2193,11 @@ llvm::Value *getSizeInBytes(DataLayout &dl, const mlir::Type &type, return builder.getInt64(dl.getTypeSizeInBits(type) / 8); } -void collectMapDataFromMapOperands(MapInfoData &mapData, - llvm::SmallVectorImpl &mapOperands, - LLVM::ModuleTranslation &moduleTranslation, - DataLayout &dl, - llvm::IRBuilderBase &builder) { - for (mlir::Value mapValue : mapOperands) { +void collectMapDataFromMapVars(MapInfoData &mapData, + llvm::SmallVectorImpl &mapVars, + LLVM::ModuleTranslation &moduleTranslation, + DataLayout &dl, llvm::IRBuilderBase &builder) { + for (mlir::Value mapValue : mapVars) { if (auto mapOp = mlir::dyn_cast_if_present( mapValue.getDefiningOp())) { mlir::Value offloadPtr = @@ -2211,7 +2237,7 @@ void collectMapDataFromMapOperands(MapInfoData &mapData, // TODO: May require some further additions to support nested record // types, i.e. member maps that can have member maps. mapData.IsAMember.push_back(false); - for (mlir::Value mapValue : mapOperands) { + for (mlir::Value mapValue : mapVars) { if (auto map = mlir::dyn_cast_if_present( mapValue.getDefiningOp())) { for (auto member : map.getMembers()) { @@ -2689,8 +2715,8 @@ static void genMapInfos(llvm::IRBuilderBase &builder, DataLayout &dl, llvm::OpenMPIRBuilder::MapInfosTy &combinedInfo, MapInfoData &mapData, - const SmallVector &devPtrOperands = {}, - const SmallVector &devAddrOperands = {}, + const SmallVector &useDevicePtrVars = {}, + const SmallVector &useDeviceAddrVars = {}, bool isTargetParams = false) { // We wish to modify some of the methods in which arguments are // passed based on their capture type by the target region, this can @@ -2748,13 +2774,13 @@ static void genMapInfos(llvm::IRBuilderBase &builder, return false; }; - auto addDevInfos = [&, fail](auto devOperands, auto devOpType) -> void { - for (const auto &devOp : devOperands) { + auto addDevInfos = [&, fail](auto useDeviceVars, auto devOpType) -> void { + for (const auto &useDeviceVar : useDeviceVars) { // TODO: Only LLVMPointerTypes are handled. - if (!isa(devOp.getType())) + if (!isa(useDeviceVar.getType())) return fail(); - llvm::Value *mapOpValue = moduleTranslation.lookupValue(devOp); + llvm::Value *mapOpValue = moduleTranslation.lookupValue(useDeviceVar); // Check if map info is already present for this entry. 
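Each entry of mapVars walked by these helpers is defined by an omp.map.info op; a compact sketch (pointer and element types assumed for illustration):

    %map = omp.map.info var_ptr(%ptr : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}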
unsigned infoIndex; @@ -2767,7 +2793,7 @@ static void genMapInfos(llvm::IRBuilderBase &builder, combinedInfo.Pointers.emplace_back(mapOpValue); combinedInfo.DevicePointers.emplace_back(devOpType); combinedInfo.Names.emplace_back( - LLVM::createMappingInformation(devOp.getLoc(), *ompBuilder)); + LLVM::createMappingInformation(useDeviceVar.getLoc(), *ompBuilder)); combinedInfo.Types.emplace_back( llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM); combinedInfo.Sizes.emplace_back(builder.getInt64(0)); @@ -2775,8 +2801,8 @@ static void genMapInfos(llvm::IRBuilderBase &builder, } }; - addDevInfos(devPtrOperands, llvm::OpenMPIRBuilder::DeviceInfoTy::Pointer); - addDevInfos(devAddrOperands, llvm::OpenMPIRBuilder::DeviceInfoTy::Address); + addDevInfos(useDevicePtrVars, llvm::OpenMPIRBuilder::DeviceInfoTy::Pointer); + addDevInfos(useDeviceAddrVars, llvm::OpenMPIRBuilder::DeviceInfoTy::Address); } static LogicalResult @@ -2784,9 +2810,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { llvm::Value *ifCond = nullptr; int64_t deviceID = llvm::omp::OMP_DEVICEID_UNDEF; - SmallVector mapOperands; - SmallVector useDevPtrOperands; - SmallVector useDevAddrOperands; + SmallVector mapVars; + SmallVector useDevicePtrVars; + SmallVector useDeviceAddrVars; llvm::omp::RuntimeFunction RTLFn; DataLayout DL = DataLayout(op->getParentOfType()); @@ -2795,8 +2821,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, LogicalResult result = llvm::TypeSwitch(op) .Case([&](omp::TargetDataOp dataOp) { - if (auto ifExprVar = dataOp.getIfExpr()) - ifCond = moduleTranslation.lookupValue(ifExprVar); + if (auto ifVar = dataOp.getIfExpr()) + ifCond = moduleTranslation.lookupValue(ifVar); if (auto devId = dataOp.getDevice()) if (auto constOp = @@ -2804,9 +2830,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, if (auto intAttr = dyn_cast(constOp.getValue())) deviceID = intAttr.getInt(); - mapOperands = dataOp.getMapOperands(); - useDevPtrOperands = dataOp.getUseDevicePtr(); - useDevAddrOperands = dataOp.getUseDeviceAddr(); + mapVars = dataOp.getMapVars(); + useDevicePtrVars = dataOp.getUseDevicePtrVars(); + useDeviceAddrVars = dataOp.getUseDeviceAddrVars(); return success(); }) .Case([&](omp::TargetEnterDataOp enterDataOp) { @@ -2814,8 +2840,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, return (LogicalResult)(enterDataOp.emitError( "`nowait` is not supported yet")); - if (auto ifExprVar = enterDataOp.getIfExpr()) - ifCond = moduleTranslation.lookupValue(ifExprVar); + if (auto ifVar = enterDataOp.getIfExpr()) + ifCond = moduleTranslation.lookupValue(ifVar); if (auto devId = enterDataOp.getDevice()) if (auto constOp = @@ -2823,7 +2849,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, if (auto intAttr = dyn_cast(constOp.getValue())) deviceID = intAttr.getInt(); RTLFn = llvm::omp::OMPRTL___tgt_target_data_begin_mapper; - mapOperands = enterDataOp.getMapOperands(); + mapVars = enterDataOp.getMapVars(); return success(); }) .Case([&](omp::TargetExitDataOp exitDataOp) { @@ -2831,8 +2857,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, return (LogicalResult)(exitDataOp.emitError( "`nowait` is not supported yet")); - if (auto ifExprVar = exitDataOp.getIfExpr()) - ifCond = moduleTranslation.lookupValue(ifExprVar); + if (auto ifVar = exitDataOp.getIfExpr()) + ifCond = moduleTranslation.lookupValue(ifVar); if (auto devId = exitDataOp.getDevice()) if (auto constOp = 
@@ -2841,7 +2867,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, deviceID = intAttr.getInt(); RTLFn = llvm::omp::OMPRTL___tgt_target_data_end_mapper; - mapOperands = exitDataOp.getMapOperands(); + mapVars = exitDataOp.getMapVars(); return success(); }) .Case([&](omp::TargetUpdateOp updateDataOp) { @@ -2849,8 +2875,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, return (LogicalResult)(updateDataOp.emitError( "`nowait` is not supported yet")); - if (auto ifExprVar = updateDataOp.getIfExpr()) - ifCond = moduleTranslation.lookupValue(ifExprVar); + if (auto ifVar = updateDataOp.getIfExpr()) + ifCond = moduleTranslation.lookupValue(ifVar); if (auto devId = updateDataOp.getDevice()) if (auto constOp = @@ -2859,7 +2885,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, deviceID = intAttr.getInt(); RTLFn = llvm::omp::OMPRTL___tgt_target_data_update_mapper; - mapOperands = updateDataOp.getMapOperands(); + mapVars = updateDataOp.getMapVars(); return success(); }) .Default([&](Operation *op) { @@ -2873,8 +2899,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; MapInfoData mapData; - collectMapDataFromMapOperands(mapData, mapOperands, moduleTranslation, DL, - builder); + collectMapDataFromMapVars(mapData, mapVars, moduleTranslation, DL, builder); // Fill up the arrays with all the mapped variables. llvm::OpenMPIRBuilder::MapInfosTy combinedInfo; @@ -2883,7 +2908,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, builder.restoreIP(codeGenIP); if (auto dataOp = dyn_cast(op)) { genMapInfos(builder, moduleTranslation, DL, combinedInfo, mapData, - useDevPtrOperands, useDevAddrOperands); + useDevicePtrVars, useDeviceAddrVars); } else { genMapInfos(builder, moduleTranslation, DL, combinedInfo, mapData); } @@ -2905,7 +2930,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, if (!info.DevicePtrInfoMap.empty()) { builder.restoreIP(codeGenIP); unsigned argIndex = 0; - for (auto &devPtrOp : useDevPtrOperands) { + for (auto &devPtrOp : useDevicePtrVars) { llvm::Value *mapOpValue = moduleTranslation.lookupValue(devPtrOp); const auto &arg = region.front().getArgument(argIndex); moduleTranslation.mapValue(arg, @@ -2913,7 +2938,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, argIndex++; } - for (auto &devAddrOp : useDevAddrOperands) { + for (auto &devAddrOp : useDeviceAddrVars) { llvm::Value *mapOpValue = moduleTranslation.lookupValue(devAddrOp); const auto &arg = region.front().getArgument(argIndex); auto *LI = builder.CreateLoad( @@ -3038,6 +3063,18 @@ static bool targetOpSupported(Operation &opInst) { return false; } + if (!targetOp.getAllocateVars().empty() || + !targetOp.getAllocatorVars().empty()) { + opInst.emitError("Allocate clause not yet supported"); + return false; + } + + if (!targetOp.getInReductionVars().empty() || + targetOp.getInReductionByref() || targetOp.getInReductionSyms()) { + opInst.emitError("In reduction clause not yet supported"); + return false; + } + return true; } @@ -3200,7 +3237,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, auto targetOp = cast(opInst); auto &targetRegion = targetOp.getRegion(); DataLayout dl = DataLayout(opInst.getParentOfType()); - SmallVector mapOperands = targetOp.getMapOperands(); + SmallVector mapVars = targetOp.getMapVars(); llvm::Function *llvmOutlinedFn = nullptr; LogicalResult bodyGenStatus = success(); @@ -3225,7 +3262,7 @@ 
convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, builder.restoreIP(codeGenIP); unsigned argIndex = 0; - for (auto &mapOp : mapOperands) { + for (auto &mapOp : mapVars) { auto mapInfoOp = mlir::dyn_cast(mapOp.getDefiningOp()); llvm::Value *mapOpValue = @@ -3255,8 +3292,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, findAllocaInsertPoint(builder, moduleTranslation); MapInfoData mapData; - collectMapDataFromMapOperands(mapData, mapOperands, moduleTranslation, dl, - builder); + collectMapDataFromMapVars(mapData, mapVars, moduleTranslation, dl, builder); llvm::OpenMPIRBuilder::MapInfosTy combinedInfos; auto genMapInfoCB = [&](llvm::OpenMPIRBuilder::InsertPointTy codeGenIP) @@ -3288,7 +3324,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, }; llvm::SmallVector kernelInput; - for (size_t i = 0; i < mapOperands.size(); ++i) { + for (size_t i = 0; i < mapVars.size(); ++i) { // declare target arguments are not passed to kernels as arguments // TODO: We currently do not handle cases where a member is explicitly // passed in as an argument, this will likley need to be handled in @@ -3299,7 +3335,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, kernelInput.push_back(mapData.OriginalValue[i]); } SmallVector dds; - buildDependData(targetOp.getDepends(), targetOp.getDependVars(), + buildDependData(targetOp.getDependKinds(), targetOp.getDependVars(), moduleTranslation, dds); builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget( @@ -3438,10 +3474,6 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder, ompBuilder->createBarrier(builder.saveIP(), llvm::omp::OMPD_barrier); return success(); }) - .Case([&](omp::TaskwaitOp) { - ompBuilder->createTaskwait(builder.saveIP()); - return success(); - }) .Case([&](omp::TaskyieldOp) { ompBuilder->createTaskyield(builder.saveIP()); return success(); @@ -3509,6 +3541,9 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder, .Case([&](omp::TaskgroupOp op) { return convertOmpTaskgroupOp(op, builder, moduleTranslation); }) + .Case([&](omp::TaskwaitOp op) { + return convertOmpTaskwaitOp(op, builder, moduleTranslation); + }) .Case([](auto op) { // `yield` and `terminator` can be just omitted. 
The block structure diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 9977dd57e3023b..28542587fb9bb8 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -420,8 +420,8 @@ func.func @omp_simd_aligned_mismatch(%arg0 : index, %arg1 : index, omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { omp.yield } - }) {alignment_values = [128], - operandSegmentSizes = array} : (memref, memref) -> () + }) {alignments = [128], + operandSegmentSizes = array} : (memref, memref) -> () return } @@ -435,7 +435,7 @@ func.func @omp_simd_aligned_negative(%arg0 : index, %arg1 : index, omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { omp.yield } - }) {alignment_values = [-1, 128], operandSegmentSizes = array} : (memref, memref) -> () + }) {alignments = [-1, 128], operandSegmentSizes = array} : (memref, memref) -> () return } @@ -449,7 +449,7 @@ func.func @omp_simd_unexpected_alignment(%arg0 : index, %arg1 : index, omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { omp.yield } - }) {alignment_values = [1, 128]} : () -> () + }) {alignments = [1, 128]} : () -> () return } @@ -463,7 +463,7 @@ func.func @omp_simd_aligned_float(%arg0 : index, %arg1 : index, omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { omp.yield } - }) {alignment_values = [1.5, 128], operandSegmentSizes = array} : (memref, memref) -> () + }) {alignments = [1.5, 128], operandSegmentSizes = array} : (memref, memref) -> () return } @@ -477,7 +477,7 @@ func.func @omp_simd_aligned_the_same_var(%arg0 : index, %arg1 : index, omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { omp.yield } - }) {alignment_values = [1, 128], operandSegmentSizes = array} : (memref, memref) -> () + }) {alignments = [1, 128], operandSegmentSizes = array} : (memref, memref) -> () return } @@ -491,7 +491,7 @@ func.func @omp_simd_nontemporal_the_same_var(%arg0 : index, %arg1 : index, omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { omp.yield } - }) {operandSegmentSizes = array} : (memref, memref) -> () + }) {operandSegmentSizes = array} : (memref, memref) -> () return } @@ -839,7 +839,7 @@ func.func @omp_ordered_region3(%x : i32) -> () { func.func @omp_ordered1(%vec0 : i64) -> () { // expected-error @below {{op must be nested inside of a loop}} - omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} + omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64} return } @@ -849,7 +849,7 @@ func.func @omp_ordered2(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64) -> ( omp.distribute { omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) { // expected-error @below {{op must be nested inside of a worksharing, simd or worksharing simd loop}} - omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} + omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64} omp.yield } omp.terminator @@ -863,7 +863,7 @@ func.func @omp_ordered3(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64) -> ( omp.wsloop { omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) { // expected-error @below {{the enclosing worksharing-loop region must have an ordered clause}} - omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} + omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64} omp.yield } omp.terminator @@ -877,7 +877,7 @@ func.func 
@omp_ordered4(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64) -> ( omp.wsloop ordered(0) { omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) { // expected-error @below {{the enclosing loop's ordered clause must have a parameter present}} - omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} + omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64} omp.yield } omp.terminator @@ -891,7 +891,7 @@ func.func @omp_ordered5(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64, %vec omp.wsloop ordered(1) { omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) { // expected-error @below {{number of variables in depend clause does not match number of iteration variables in the doacross loop}} - omp.ordered depend_type(dependsource) depend_vec(%vec0, %vec1 : i64, i64) {num_loops_val = 2 : i64} + omp.ordered depend_type(dependsource) depend_vec(%vec0, %vec1 : i64, i64) {doacross_num_loops = 2 : i64} omp.yield } omp.terminator @@ -1394,7 +1394,7 @@ func.func @omp_teams_allocate(%data_var : memref) { // expected-error @below {{expected equal sizes for allocate and allocator variables}} "omp.teams" (%data_var) ({ omp.terminator - }) {operandSegmentSizes = array} : (memref) -> () + }) {operandSegmentSizes = array} : (memref) -> () omp.terminator } return @@ -1407,7 +1407,7 @@ func.func @omp_teams_num_teams1(%lb : i32) { // expected-error @below {{expected num_teams upper bound to be defined if the lower bound is defined}} "omp.teams" (%lb) ({ omp.terminator - }) {operandSegmentSizes = array} : (i32) -> () + }) {operandSegmentSizes = array} : (i32) -> () omp.terminator } return @@ -1432,7 +1432,7 @@ func.func @omp_sections(%data_var : memref) -> () { // expected-error @below {{expected equal sizes for allocate and allocator variables}} "omp.sections" (%data_var) ({ omp.terminator - }) {operandSegmentSizes = array} : (memref) -> () + }) {operandSegmentSizes = array} : (memref) -> () return } @@ -1442,7 +1442,7 @@ func.func @omp_sections(%data_var : memref) -> () { // expected-error @below {{expected as many reduction symbol references as reduction variables}} "omp.sections" (%data_var) ({ omp.terminator - }) {operandSegmentSizes = array} : (memref) -> () + }) {operandSegmentSizes = array} : (memref) -> () return } @@ -1557,17 +1557,17 @@ func.func @omp_single(%data_var : memref) -> () { // expected-error @below {{expected equal sizes for allocate and allocator variables}} "omp.single" (%data_var) ({ omp.barrier - }) {operandSegmentSizes = array} : (memref) -> () + }) {operandSegmentSizes = array} : (memref) -> () return } // ----- func.func @omp_single_copyprivate(%data_var : memref) -> () { - // expected-error @below {{inconsistent number of copyPrivate vars (= 1) and functions (= 0), both must be equal}} + // expected-error @below {{inconsistent number of copyprivate vars (= 1) and functions (= 0), both must be equal}} "omp.single" (%data_var) ({ omp.barrier - }) {operandSegmentSizes = array} : (memref) -> () + }) {operandSegmentSizes = array} : (memref) -> () return } @@ -1623,7 +1623,7 @@ func.func @omp_task_depend(%data_var: memref) { // expected-error @below {{op expected as many depend values as depend variables}} "omp.task"(%data_var) ({ "omp.terminator"() : () -> () - }) {depends = [], operandSegmentSizes = array} : (memref) -> () + }) {depend_kinds = [], operandSegmentSizes = array} : (memref) -> () "func.return"() : () -> () } @@ -1820,7 +1820,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { 
omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { omp.yield } - }) {operandSegmentSizes = array} : (memref) -> () + }) {operandSegmentSizes = array} : (memref) -> () return } @@ -1834,7 +1834,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { omp.yield } - }) {operandSegmentSizes = array, reductions = [@add_f32]} : (!llvm.ptr, !llvm.ptr) -> () + }) {operandSegmentSizes = array, reduction_syms = [@add_f32]} : (!llvm.ptr, !llvm.ptr) -> () return } @@ -1847,7 +1847,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { omp.yield } - }) {operandSegmentSizes = array, reductions = [@add_f32, @add_f32]} : (!llvm.ptr) -> () + }) {operandSegmentSizes = array, reduction_syms = [@add_f32, @add_f32]} : (!llvm.ptr) -> () return } @@ -1861,7 +1861,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { omp.yield } - }) {in_reductions = [@add_f32], operandSegmentSizes = array} : (!llvm.ptr, !llvm.ptr) -> () + }) {in_reduction_syms = [@add_f32], operandSegmentSizes = array} : (!llvm.ptr, !llvm.ptr) -> () return } @@ -1874,7 +1874,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { omp.yield } - }) {in_reductions = [@add_f32, @add_f32], operandSegmentSizes = array} : (!llvm.ptr) -> () + }) {in_reduction_syms = [@add_f32, @add_f32], operandSegmentSizes = array} : (!llvm.ptr) -> () return } @@ -1934,7 +1934,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testi64 = "test.i64"() : () -> (i64) // expected-error @below {{the grainsize clause and num_tasks clause are mutually exclusive and may not appear on the same taskloop directive}} - omp.taskloop grain_size(%testi64: i64) num_tasks(%testi64: i64) { + omp.taskloop grainsize(%testi64: i64) num_tasks(%testi64: i64) { omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { omp.yield } @@ -2001,7 +2001,7 @@ func.func @omp_target_data(%map1: memref) { // ----- func.func @omp_target_data() { - // expected-error @below {{At least one of map, useDevicePtr, or useDeviceAddr operand must be present}} + // expected-error @below {{At least one of map, use_device_ptr_vars, or use_device_addr_vars operand must be present}} omp.target_data {} return } @@ -2129,7 +2129,7 @@ func.func @omp_target_depend(%data_var: memref) { // expected-error @below {{op expected as many depend values as depend variables}} "omp.target"(%data_var) ({ "omp.terminator"() : () -> () - }) {depends = [], operandSegmentSizes = array} : (memref) -> () + }) {depend_kinds = [], operandSegmentSizes = array} : (memref) -> () "func.return"() : () -> () } @@ -2137,7 +2137,7 @@ func.func @omp_target_depend(%data_var: memref) { func.func @omp_distribute_schedule(%chunk_size : i32) -> () { // expected-error @below {{op chunk size set without dist_schedule_static being present}} - "omp.distribute"(%chunk_size) <{operandSegmentSizes = array}> ({ + "omp.distribute"(%chunk_size) <{operandSegmentSizes = array}> ({ "omp.terminator"() : () -> () }) : (i32) -> () } @@ -2146,7 +2146,7 @@ func.func @omp_distribute_schedule(%chunk_size : i32) -> () { func.func @omp_distribute_allocate(%data_var : memref) -> () { // expected-error @below {{expected equal sizes for allocate and 
allocator variables}} - "omp.distribute"(%data_var) <{operandSegmentSizes = array}> ({ + "omp.distribute"(%data_var) <{operandSegmentSizes = array}> ({ "omp.terminator"() : () -> () }) : (memref) -> () } @@ -2340,7 +2340,7 @@ func.func @undefined_privatizer(%arg0: index) { // ----- func.func @undefined_privatizer(%arg0: !llvm.ptr) { // expected-error @below {{inconsistent number of private variables and privatizer op symbols, private vars: 1 vs. privatizer op symbols: 2}} - "omp.parallel"(%arg0) <{operandSegmentSizes = array, privatizers = [@x.privatizer, @y.privatizer]}> ({ + "omp.parallel"(%arg0) <{operandSegmentSizes = array, private_syms = [@x.privatizer, @y.privatizer]}> ({ ^bb0(%arg2: !llvm.ptr): omp.terminator }) : (!llvm.ptr) -> () diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index d6f4a810c4a80b..057d54c0c2f7cc 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -91,7 +91,7 @@ func.func @omp_parallel(%data_var : memref, %if_cond : i1, %num_threads : i }) {operandSegmentSizes = array} : (i1, i32) -> () omp.terminator - }) {operandSegmentSizes = array, proc_bind_val = #omp} : (i1, i32, memref, memref) -> () + }) {operandSegmentSizes = array, proc_bind_kind = #omp} : (i1, i32, memref, memref) -> () // test with multiple parameters for single variadic argument // CHECK: omp.parallel allocate(%{{.*}} : memref -> %{{.*}} : memref) @@ -184,7 +184,7 @@ func.func @omp_loop_nest(%lb : index, %ub : index, %step : index) -> () { "omp.loop_nest" (%lb, %ub, %step) ({ ^bb0(%iv: index): omp.yield - }) {inclusive} : (index, index, index) -> () + }) {loop_inclusive} : (index, index, index) -> () omp.terminator } @@ -382,7 +382,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre omp.yield } omp.terminator - }) {operandSegmentSizes = array, ordered_val = 1} : + }) {operandSegmentSizes = array, ordered = 1} : () -> () // CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : memref) schedule(static) { @@ -392,7 +392,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre omp.yield } omp.terminator - }) {operandSegmentSizes = array, schedule_val = #omp} : + }) {operandSegmentSizes = array, schedule_kind = #omp} : (memref, i32) -> () // CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : memref, %{{.*}} = %{{.*}} : memref) schedule(static) { @@ -402,7 +402,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre omp.yield } omp.terminator - }) {operandSegmentSizes = array, schedule_val = #omp} : + }) {operandSegmentSizes = array, schedule_kind = #omp} : (memref, memref, i32, i32) -> () // CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : memref) schedule(dynamic = %{{.*}}) ordered(2) { @@ -412,7 +412,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre omp.yield } omp.terminator - }) {operandSegmentSizes = array, schedule_val = #omp, ordered_val = 2} : + }) {operandSegmentSizes = array, schedule_kind = #omp, ordered = 2} : (memref, i32, i32) -> () // CHECK: omp.wsloop schedule(auto) nowait { @@ -422,7 +422,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre omp.yield } omp.terminator - }) {operandSegmentSizes = array, nowait, schedule_val = #omp} : + }) {operandSegmentSizes = array, nowait, schedule_kind = #omp} : () -> () // CHECK: omp.wsloop { @@ -574,8 +574,8 @@ func.func @omp_simd_aligned_list(%arg0 : index, %arg1 : index, %arg2 : index, "omp.yield"() : () -> () }) : (index, index, index) 
-> () "omp.terminator"() : () -> () - }) {alignment_values = [32, 128], - operandSegmentSizes = array} : (memref, memref) -> () + }) {alignments = [32, 128], + operandSegmentSizes = array} : (memref, memref) -> () return } @@ -589,8 +589,8 @@ func.func @omp_simd_aligned_single(%arg0 : index, %arg1 : index, %arg2 : index, "omp.yield"() : () -> () }) : (index, index, index) -> () "omp.terminator"() : () -> () - }) {alignment_values = [32], - operandSegmentSizes = array} : (memref) -> () + }) {alignments = [32], + operandSegmentSizes = array} : (memref) -> () return } @@ -605,7 +605,7 @@ func.func @omp_simd_nontemporal_list(%arg0 : index, %arg1 : index, "omp.yield"() : () -> () }) : (index, index, index) -> () "omp.terminator"() : () -> () - }) {operandSegmentSizes = array} : (memref, memref) -> () + }) {operandSegmentSizes = array} : (memref, memref) -> () return } @@ -620,7 +620,7 @@ func.func @omp_simd_nontemporal_single(%arg0 : index, %arg1 : index, "omp.yield"() : () -> () }) : (index, index, index) -> () "omp.terminator"() : () -> () - }) {operandSegmentSizes = array} : (memref) -> () + }) {operandSegmentSizes = array} : (memref) -> () return } @@ -752,8 +752,8 @@ func.func @omp_distribute(%chunk_size : i32, %data_var : memref, %arg0 : i3 } omp.terminator } - // CHECK: omp.distribute dist_schedule_static chunk_size(%{{.+}} : i32) - omp.distribute dist_schedule_static chunk_size(%chunk_size : i32) { + // CHECK: omp.distribute dist_schedule_static dist_schedule_chunk_size(%{{.+}} : i32) + omp.distribute dist_schedule_static dist_schedule_chunk_size(%chunk_size : i32) { omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { omp.yield } @@ -809,7 +809,7 @@ func.func @omp_target(%if_cond : i1, %device : si32, %num_threads : i32, %devic "omp.target"(%if_cond, %device, %num_threads) ({ // CHECK: omp.terminator omp.terminator - }) {nowait, operandSegmentSizes = array} : ( i1, si32, i32 ) -> () + }) {nowait, operandSegmentSizes = array} : ( i1, si32, i32 ) -> () // Test with optional map clause. 
// CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_1:.*]] : memref, tensor) map_clauses(tofrom) capture(ByRef) -> memref {name = ""} @@ -1294,11 +1294,11 @@ func.func @omp_ordered(%arg1 : i32, %arg2 : i32, %arg3 : i32, omp.wsloop ordered(1) { omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) { // Only one DEPEND(SINK: vec) clause - // CHECK: omp.ordered depend_type(dependsink) depend_vec(%{{.*}} : i64) {num_loops_val = 1 : i64} - omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} + // CHECK: omp.ordered depend_type(dependsink) depend_vec(%{{.*}} : i64) {doacross_num_loops = 1 : i64} + omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64} - // CHECK: omp.ordered depend_type(dependsource) depend_vec(%{{.*}} : i64) {num_loops_val = 1 : i64} - omp.ordered depend_type(dependsource) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64} + // CHECK: omp.ordered depend_type(dependsource) depend_vec(%{{.*}} : i64) {doacross_num_loops = 1 : i64} + omp.ordered depend_type(dependsource) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64} omp.yield } @@ -1308,11 +1308,11 @@ func.func @omp_ordered(%arg1 : i32, %arg2 : i32, %arg3 : i32, omp.wsloop ordered(2) { omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) { // Multiple DEPEND(SINK: vec) clauses - // CHECK: omp.ordered depend_type(dependsink) depend_vec(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i64, i64, i64, i64) {num_loops_val = 2 : i64} - omp.ordered depend_type(dependsink) depend_vec(%vec0, %vec1, %vec2, %vec3 : i64, i64, i64, i64) {num_loops_val = 2 : i64} + // CHECK: omp.ordered depend_type(dependsink) depend_vec(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i64, i64, i64, i64) {doacross_num_loops = 2 : i64} + omp.ordered depend_type(dependsink) depend_vec(%vec0, %vec1, %vec2, %vec3 : i64, i64, i64, i64) {doacross_num_loops = 2 : i64} - // CHECK: omp.ordered depend_type(dependsource) depend_vec(%{{.*}}, %{{.*}} : i64, i64) {num_loops_val = 2 : i64} - omp.ordered depend_type(dependsource) depend_vec(%vec0, %vec1 : i64, i64) {num_loops_val = 2 : i64} + // CHECK: omp.ordered depend_type(dependsource) depend_vec(%{{.*}}, %{{.*}} : i64, i64) {doacross_num_loops = 2 : i64} + omp.ordered depend_type(dependsource) depend_vec(%vec0, %vec1 : i64, i64) {doacross_num_loops = 2 : i64} omp.yield } @@ -1874,13 +1874,13 @@ func.func @omp_sectionsop(%data_var1 : memref, %data_var2 : memref, "omp.sections" (%data_var1, %data_var1) ({ // CHECK: omp.terminator omp.terminator - }) {operandSegmentSizes = array} : (memref, memref) -> () + }) {operandSegmentSizes = array} : (memref, memref) -> () // CHECK: omp.sections reduction(@add_f32 -> %{{.*}} : !llvm.ptr) "omp.sections" (%redn_var) ({ // CHECK: omp.terminator omp.terminator - }) {operandSegmentSizes = array, reduction_vars_byref = array, reductions=[@add_f32]} : (!llvm.ptr) -> () + }) {operandSegmentSizes = array, reduction_byref = array, reduction_syms=[@add_f32]} : (!llvm.ptr) -> () // CHECK: omp.sections nowait { omp.sections nowait { @@ -2136,7 +2136,7 @@ func.func @omp_target_depend(%arg0: memref, %arg1: memref) { omp.target depend(taskdependin -> %arg0 : memref, taskdependin -> %arg1 : memref, taskdependinout -> %arg0 : memref) { // CHECK: omp.terminator omp.terminator - } {operandSegmentSizes = array} + } {operandSegmentSizes = array} return } @@ -2421,8 +2421,8 @@ func.func @omp_taskloop(%lb: i32, %ub: i32, %step: i32) -> () { } %testi64 = "test.i64"() : () -> (i64) - // CHECK: omp.taskloop grain_size(%{{[^:]+}}: i64) { - 
omp.taskloop grain_size(%testi64: i64) { + // CHECK: omp.taskloop grainsize(%{{[^:]+}}: i64) { + omp.taskloop grainsize(%testi64: i64) { omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { // CHECK: omp.yield omp.yield diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 04c2e9fa091bd7..acebb20674406c 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -148,12 +148,12 @@ llvm.func @test_omp_parallel_num_threads_3() -> () { // CHECK: define internal void @[[OMP_OUTLINED_FN_NUM_THREADS_3_1]] // CHECK: call void @__kmpc_barrier -// CHECK: define void @test_omp_parallel_if_1(i32 %[[IF_VAR_1:.*]]) +// CHECK: define void @test_omp_parallel_if_1(i32 %[[IF_EXPR_1:.*]]) llvm.func @test_omp_parallel_if_1(%arg0: i32) -> () { %0 = llvm.mlir.constant(0 : index) : i32 %1 = llvm.icmp "slt" %arg0, %0 : i32 -// CHECK: %[[IF_COND_VAR_1:.*]] = icmp slt i32 %[[IF_VAR_1]], 0 +// CHECK: %[[IF_COND_VAR_1:.*]] = icmp slt i32 %[[IF_EXPR_1]], 0 // CHECK: %[[GTN_IF_1:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[SI_VAR_IF_1:.*]]) @@ -1330,14 +1330,14 @@ llvm.func @omp_ordered(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i64, // CHECK: [[TMP2:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR]], i64 0, i64 0 // CHECK: [[OMP_THREAD2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]) // CHECK: call void @__kmpc_doacross_wait(ptr @[[GLOB3]], i32 [[OMP_THREAD2]], ptr [[TMP2]]) - omp.ordered depend_type(dependsink) depend_vec(%arg3 : i64) {num_loops_val = 1 : i64} + omp.ordered depend_type(dependsink) depend_vec(%arg3 : i64) {doacross_num_loops = 1 : i64} // CHECK: [[TMP3:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR3]], i64 0, i64 0 // CHECK: store i64 [[ARG0]], ptr [[TMP3]], align 8 // CHECK: [[TMP4:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR3]], i64 0, i64 0 // CHECK: [[OMP_THREAD4:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB5:[0-9]+]]) // CHECK: call void @__kmpc_doacross_post(ptr @[[GLOB5]], i32 [[OMP_THREAD4]], ptr [[TMP4]]) - omp.ordered depend_type(dependsource) depend_vec(%arg3 : i64) {num_loops_val = 1 : i64} + omp.ordered depend_type(dependsource) depend_vec(%arg3 : i64) {doacross_num_loops = 1 : i64} omp.yield } @@ -1360,7 +1360,7 @@ llvm.func @omp_ordered(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i64, // CHECK: [[TMP10:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR7]], i64 0, i64 0 // CHECK: [[OMP_THREAD8:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB7]]) // CHECK: call void @__kmpc_doacross_wait(ptr @[[GLOB7]], i32 [[OMP_THREAD8]], ptr [[TMP10]]) - omp.ordered depend_type(dependsink) depend_vec(%arg3, %arg4, %arg5, %arg6 : i64, i64, i64, i64) {num_loops_val = 2 : i64} + omp.ordered depend_type(dependsink) depend_vec(%arg3, %arg4, %arg5, %arg6 : i64, i64, i64, i64) {doacross_num_loops = 2 : i64} // CHECK: [[TMP11:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR9]], i64 0, i64 0 // CHECK: store i64 [[ARG0]], ptr [[TMP11]], align 8 @@ -1369,7 +1369,7 @@ llvm.func @omp_ordered(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i64, // CHECK: [[TMP13:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR9]], i64 0, i64 0 // CHECK: [[OMP_THREAD10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB9:[0-9]+]]) // CHECK: call void @__kmpc_doacross_post(ptr @[[GLOB9]], i32 [[OMP_THREAD10]], ptr [[TMP13]]) - omp.ordered depend_type(dependsource) depend_vec(%arg3, %arg4 : i64, i64) {num_loops_val = 2 : i64} + omp.ordered 
depend_type(dependsource) depend_vec(%arg3, %arg4 : i64, i64) {doacross_num_loops = 2 : i64} omp.yield } diff --git a/revert_patches.txt b/revert_patches.txt index a61daa07967d3e..cd61d1e69ac054 100644 --- a/revert_patches.txt +++ b/revert_patches.txt @@ -131,6 +131,7 @@ Ron --- Revert: breaks devIO for openmp dfeb3991fb48 Remove the `x86_mmx` IR type. (#98505) +b7e4fba6e5dc Cleanup x86_mmx after removing IR type (#100646) (Reason: dependent on dfeb3991fb48) Ron --- revert : needs downstream rework on MMI removal