diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt index fc5a07e69e9d62..5db345ecc63f09 100644 --- a/clang-tools-extra/clangd/CMakeLists.txt +++ b/clang-tools-extra/clangd/CMakeLists.txt @@ -41,6 +41,7 @@ add_clang_library(clangDaemon ClangdServer.cpp CodeComplete.cpp CodeCompletionStrings.cpp + CollectMacros.cpp CompileCommands.cpp Compiler.cpp Context.cpp diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp index 55e63c71b23eb7..9d93b8592fdcd8 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.cpp +++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp @@ -18,6 +18,7 @@ #include "Trace.h" #include "URI.h" #include "refactor/Tweak.h" +#include "clang/Basic/Version.h" #include "clang/Tooling/Core/Replacement.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" @@ -546,7 +547,10 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params, CodeAction::INFO_KIND}}}; llvm::json::Object Result{ - {{"capabilities", + {{"serverInfo", + llvm::json::Object{{"name", "clangd"}, + {"version", getClangToolFullVersion("clangd")}}}, + {"capabilities", llvm::json::Object{ {"textDocumentSync", (int)TextDocumentSyncKind::Incremental}, {"documentFormattingProvider", true}, @@ -600,6 +604,8 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params, Reply(std::move(Result)); } +void ClangdLSPServer::onInitialized(const InitializedParams &Params) {} + void ClangdLSPServer::onShutdown(const ShutdownParams &Params, Callback Reply) { // Do essentially nothing, just say we're ready to exit. @@ -808,7 +814,9 @@ void ClangdLSPServer::onDocumentDidClose( // VSCode). Note that this cannot race with actual diagnostics responses // because removeDocument() guarantees no diagnostic callbacks will be // executed after it returns. 
- publishDiagnostics(URIForFile::canonicalize(File, /*TUPath=*/File), {}); + PublishDiagnosticsParams Notification; + Notification.uri = URIForFile::canonicalize(File, /*TUPath=*/File); + publishDiagnostics(Notification); } void ClangdLSPServer::onDocumentOnTypeFormatting( @@ -1145,18 +1153,13 @@ void ClangdLSPServer::applyConfiguration( } void ClangdLSPServer::publishSemanticHighlighting( - SemanticHighlightingParams Params) { + const SemanticHighlightingParams &Params) { notify("textDocument/semanticHighlighting", Params); } void ClangdLSPServer::publishDiagnostics( - const URIForFile &File, std::vector Diagnostics) { - // Publish diagnostics. - notify("textDocument/publishDiagnostics", - llvm::json::Object{ - {"uri", File}, - {"diagnostics", std::move(Diagnostics)}, - }); + const PublishDiagnosticsParams &Params) { + notify("textDocument/publishDiagnostics", Params); } // FIXME: This function needs to be properly tested. @@ -1243,6 +1246,7 @@ ClangdLSPServer::ClangdLSPServer( NegotiatedOffsetEncoding(ForcedOffsetEncoding) { // clang-format off MsgHandler->bind("initialize", &ClangdLSPServer::onInitialize); + MsgHandler->bind("initialized", &ClangdLSPServer::onInitialized); MsgHandler->bind("shutdown", &ClangdLSPServer::onShutdown); MsgHandler->bind("sync", &ClangdLSPServer::onSync); MsgHandler->bind("textDocument/rangeFormatting", &ClangdLSPServer::onDocumentRangeFormatting); @@ -1361,15 +1365,15 @@ void ClangdLSPServer::onHighlightingsReady( void ClangdLSPServer::onDiagnosticsReady(PathRef File, std::vector Diagnostics) { - auto URI = URIForFile::canonicalize(File, /*TUPath=*/File); - std::vector LSPDiagnostics; + PublishDiagnosticsParams Notification; + Notification.uri = URIForFile::canonicalize(File, /*TUPath=*/File); DiagnosticToReplacementMap LocalFixIts; // Temporary storage for (auto &Diag : Diagnostics) { - toLSPDiags(Diag, URI, DiagOpts, + toLSPDiags(Diag, Notification.uri, DiagOpts, [&](clangd::Diagnostic Diag, llvm::ArrayRef Fixes) { auto 
&FixItsForDiagnostic = LocalFixIts[Diag]; llvm::copy(Fixes, std::back_inserter(FixItsForDiagnostic)); - LSPDiagnostics.push_back(std::move(Diag)); + Notification.diagnostics.push_back(std::move(Diag)); }); } @@ -1380,7 +1384,7 @@ void ClangdLSPServer::onDiagnosticsReady(PathRef File, } // Send a notification to the LSP client. - publishDiagnostics(URI, std::move(LSPDiagnostics)); + publishDiagnostics(Notification); } void ClangdLSPServer::onBackgroundIndexProgress( diff --git a/clang-tools-extra/clangd/ClangdLSPServer.h b/clang-tools-extra/clangd/ClangdLSPServer.h index f30fbf6b51492a..4ab0354ead72a3 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.h +++ b/clang-tools-extra/clangd/ClangdLSPServer.h @@ -67,6 +67,7 @@ class ClangdLSPServer : private ClangdServer::Callbacks { // LSP methods. Notifications have signature void(const Params&). // Calls have signature void(const Params&, Callback). void onInitialize(const InitializeParams &, Callback); + void onInitialized(const InitializedParams &); void onShutdown(const ShutdownParams &, Callback); void onSync(const NoParams &, Callback); void onDocumentDidOpen(const DidOpenTextDocumentParams &); @@ -132,11 +133,10 @@ class ClangdLSPServer : private ClangdServer::Callbacks { void applyConfiguration(const ConfigurationSettings &Settings); /// Sends a "publishSemanticHighlighting" notification to the LSP client. - void publishSemanticHighlighting(SemanticHighlightingParams Params); + void publishSemanticHighlighting(const SemanticHighlightingParams &); /// Sends a "publishDiagnostics" notification to the LSP client. 
- void publishDiagnostics(const URIForFile &File, - std::vector Diagnostics); + void publishDiagnostics(const PublishDiagnosticsParams &); /// Since initialization of CDBs and ClangdServer is done lazily, the /// following context captures the one used while creating ClangdLSPServer and diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp index f1a88902c8c0e5..5dd00322a5abf8 100644 --- a/clang-tools-extra/clangd/ClangdServer.cpp +++ b/clang-tools-extra/clangd/ClangdServer.cpp @@ -194,10 +194,6 @@ void ClangdServer::addDocument(PathRef File, llvm::StringRef Contents, void ClangdServer::removeDocument(PathRef File) { WorkScheduler.remove(File); } -llvm::StringRef ClangdServer::getDocument(PathRef File) const { - return WorkScheduler.getContents(File); -} - void ClangdServer::codeComplete(PathRef File, Position Pos, const clangd::CodeCompleteOptions &Opts, Callback CB) { diff --git a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h index e9f2c30b174955..d098f6242f72c2 100644 --- a/clang-tools-extra/clangd/ClangdServer.h +++ b/clang-tools-extra/clangd/ClangdServer.h @@ -175,9 +175,6 @@ class ClangdServer { WantDiagnostics WD = WantDiagnostics::Auto, bool ForceRebuild = false); - /// Get the contents of \p File, which should have been added. - llvm::StringRef getDocument(PathRef File) const; - /// Remove \p File from list of tracked files, schedule a request to free /// resources associated with it. Pending diagnostics for closed files may not /// be delivered, even if requested with WantDiags::Auto or WantDiags::Yes. 
diff --git a/clang-tools-extra/clangd/CollectMacros.cpp b/clang-tools-extra/clangd/CollectMacros.cpp new file mode 100644 index 00000000000000..ea7dd18ee13036 --- /dev/null +++ b/clang-tools-extra/clangd/CollectMacros.cpp @@ -0,0 +1,34 @@ +//===--- CollectMacros.cpp ---------------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CollectMacros.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Lex/Lexer.h" + +namespace clang { +namespace clangd { + +void CollectMainFileMacros::add(const Token &MacroNameTok, + const MacroInfo *MI) { + if (!InMainFile) + return; + auto Loc = MacroNameTok.getLocation(); + if (Loc.isInvalid() || Loc.isMacroID()) + return; + + auto Name = MacroNameTok.getIdentifierInfo()->getName(); + Out.Names.insert(Name); + auto Range = halfOpenToRange( + SM, CharSourceRange::getCharRange(Loc, MacroNameTok.getEndLoc())); + if (auto SID = getSymbolID(Name, MI, SM)) + Out.MacroRefs[*SID].push_back(Range); + else + Out.UnknownMacros.push_back(Range); +} +} // namespace clangd +} // namespace clang diff --git a/clang-tools-extra/clangd/CollectMacros.h b/clang-tools-extra/clangd/CollectMacros.h index 5c3fca10ad4a5c..eecea0455be270 100644 --- a/clang-tools-extra/clangd/CollectMacros.h +++ b/clang-tools-extra/clangd/CollectMacros.h @@ -40,10 +40,8 @@ struct MainFileMacros { /// - collect macros after the preamble of the main file (in ParsedAST.cpp) class CollectMainFileMacros : public PPCallbacks { public: - explicit CollectMainFileMacros(const SourceManager &SM, - const LangOptions &LangOpts, - MainFileMacros &Out) - : SM(SM), LangOpts(LangOpts), Out(Out) {} + explicit CollectMainFileMacros(const SourceManager &SM, MainFileMacros &Out) + : SM(SM), Out(Out) 
{} void FileChanged(SourceLocation Loc, FileChangeReason, SrcMgr::CharacteristicKind, FileID) override { @@ -89,24 +87,8 @@ class CollectMainFileMacros : public PPCallbacks { } private: - void add(const Token &MacroNameTok, const MacroInfo *MI) { - if (!InMainFile) - return; - auto Loc = MacroNameTok.getLocation(); - if (Loc.isMacroID()) - return; - - if (auto Range = getTokenRange(SM, LangOpts, Loc)) { - auto Name = MacroNameTok.getIdentifierInfo()->getName(); - Out.Names.insert(Name); - if (auto SID = getSymbolID(Name, MI, SM)) - Out.MacroRefs[*SID].push_back(*Range); - else - Out.UnknownMacros.push_back(*Range); - } - } + void add(const Token &MacroNameTok, const MacroInfo *MI); const SourceManager &SM; - const LangOptions &LangOpts; bool InMainFile = true; MainFileMacros &Out; }; diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index 5796657a5800be..5c1288c14b5860 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -26,6 +26,7 @@ #include "clang/AST/ExprCXX.h" #include "clang/AST/PrettyPrinter.h" #include "clang/AST/Type.h" +#include "clang/Basic/SourceLocation.h" #include "clang/Basic/Specifiers.h" #include "clang/Basic/TokenKinds.h" #include "clang/Index/IndexSymbol.h" @@ -530,32 +531,33 @@ llvm::Optional getHover(ParsedAST &AST, Position Pos, llvm::consumeError(CurLoc.takeError()); return llvm::None; } - auto TokensTouchingCursor = - syntax::spelledTokensTouching(*CurLoc, AST.getTokens()); + const auto &TB = AST.getTokens(); + auto TokensTouchingCursor = syntax::spelledTokensTouching(*CurLoc, TB); // Early exit if there were no tokens around the cursor. if (TokensTouchingCursor.empty()) return llvm::None; - // To be used as a backup for highlighting the selected token. 
- SourceLocation IdentLoc; + // To be used as a backup for highlighting the selected token, we use back as + // it aligns better with biases elsewhere (editors tend to send the position + // for the left of the hovered token). + CharSourceRange HighlightRange = + TokensTouchingCursor.back().range(SM).toCharRange(SM); llvm::Optional HI; // Macros and deducedtype only works on identifiers and auto/decltype keywords // respectively. Therefore they are only trggered on whichever works for them, // similar to SelectionTree::create(). for (const auto &Tok : TokensTouchingCursor) { if (Tok.kind() == tok::identifier) { - IdentLoc = Tok.location(); + // Prefer the identifier token as a fallback highlighting range. + HighlightRange = Tok.range(SM).toCharRange(SM); if (auto M = locateMacroAt(Tok, AST.getPreprocessor())) { HI = getHoverContents(*M, AST); - HI->SymRange = getTokenRange(AST.getSourceManager(), AST.getLangOpts(), - Tok.location()); break; } } else if (Tok.kind() == tok::kw_auto || Tok.kind() == tok::kw_decltype) { if (auto Deduced = getDeducedType(AST.getASTContext(), Tok.location())) { HI = getHoverContents(*Deduced, AST.getASTContext(), Index); - HI->SymRange = getTokenRange(AST.getSourceManager(), AST.getLangOpts(), - Tok.location()); + HighlightRange = Tok.range(SM).toCharRange(SM); break; } } @@ -566,10 +568,11 @@ llvm::Optional getHover(ParsedAST &AST, Position Pos, auto Offset = SM.getFileOffset(*CurLoc); // Editors send the position on the left of the hovered character. // So our selection tree should be biased right. (Tested with VSCode). - SelectionTree ST = SelectionTree::createRight( - AST.getASTContext(), AST.getTokens(), Offset, Offset); + SelectionTree ST = + SelectionTree::createRight(AST.getASTContext(), TB, Offset, Offset); std::vector Result; if (const SelectionTree::Node *N = ST.commonAncestor()) { + // FIXME: Fill in HighlightRange with range coming from N->ASTNode. 
auto Decls = explicitReferenceTargets(N->ASTNode, DeclRelation::Alias); if (!Decls.empty()) { HI = getHoverContents(Decls.front(), Index); @@ -592,14 +595,7 @@ llvm::Optional getHover(ParsedAST &AST, Position Pos, if (auto Formatted = tooling::applyAllReplacements(HI->Definition, Replacements)) HI->Definition = *Formatted; - // FIXME: We should rather fill this with info coming from SelectionTree node. - if (!HI->SymRange) { - SourceLocation ToHighlight = TokensTouchingCursor.front().location(); - if (IdentLoc.isValid()) - ToHighlight = IdentLoc; - HI->SymRange = - getTokenRange(AST.getSourceManager(), AST.getLangOpts(), ToHighlight); - } + HI->SymRange = halfOpenToRange(SM, HighlightRange); return HI; } diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp index 36a9c47f7a9d2e..e43c2ce662616c 100644 --- a/clang-tools-extra/clangd/ParsedAST.cpp +++ b/clang-tools-extra/clangd/ParsedAST.cpp @@ -350,7 +350,7 @@ ParsedAST::build(std::unique_ptr CI, Macros = Preamble->Macros; Clang->getPreprocessor().addPPCallbacks( std::make_unique(Clang->getSourceManager(), - Clang->getLangOpts(), Macros)); + Macros)); // Copy over the includes from the preamble, then combine with the // non-preamble includes below. 
diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp index eca545fd09e4b3..f2b6b017f10f58 100644 --- a/clang-tools-extra/clangd/Preamble.cpp +++ b/clang-tools-extra/clangd/Preamble.cpp @@ -54,7 +54,7 @@ class CppFilePreambleCallbacks : public PreambleCallbacks { return std::make_unique( collectIncludeStructureCallback(*SourceMgr, &Includes), - std::make_unique(*SourceMgr, *LangOpts, Macros)); + std::make_unique(*SourceMgr, Macros)); } CommentHandler *getCommentHandler() override { diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp index 8e89c1f45f3a53..5a867c52c1ed26 100644 --- a/clang-tools-extra/clangd/Protocol.cpp +++ b/clang-tools-extra/clangd/Protocol.cpp @@ -531,6 +531,13 @@ bool fromJSON(const llvm::json::Value &Params, Diagnostic &R) { return true; } +llvm::json::Value toJSON(const PublishDiagnosticsParams &PDP) { + return llvm::json::Object{ + {"uri", PDP.uri}, + {"diagnostics", PDP.diagnostics}, + }; +} + bool fromJSON(const llvm::json::Value &Params, CodeActionContext &R) { llvm::json::ObjectMapper O(Params); return O && O.map("diagnostics", R.diagnostics); diff --git a/clang-tools-extra/clangd/Protocol.h b/clang-tools-extra/clangd/Protocol.h index a376e5f39e795c..596c7e9004e79d 100644 --- a/clang-tools-extra/clangd/Protocol.h +++ b/clang-tools-extra/clangd/Protocol.h @@ -239,6 +239,7 @@ bool fromJSON(const llvm::json::Value &E, TraceLevel &Out); struct NoParams {}; inline bool fromJSON(const llvm::json::Value &, NoParams &) { return true; } +using InitializedParams = NoParams; using ShutdownParams = NoParams; using ExitParams = NoParams; @@ -791,6 +792,14 @@ struct LSPDiagnosticCompare { bool fromJSON(const llvm::json::Value &, Diagnostic &); llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Diagnostic &); +struct PublishDiagnosticsParams { + /// The URI for which diagnostic information is reported. + URIForFile uri; + /// An array of diagnostic information items. 
+ std::vector diagnostics; +}; +llvm::json::Value toJSON(const PublishDiagnosticsParams &); + struct CodeActionContext { /// An array of diagnostics. std::vector diagnostics; diff --git a/clang-tools-extra/clangd/SemanticHighlighting.cpp b/clang-tools-extra/clangd/SemanticHighlighting.cpp index e7b1618fd2d4fb..d5c51ebff5e1e1 100644 --- a/clang-tools-extra/clangd/SemanticHighlighting.cpp +++ b/clang-tools-extra/clangd/SemanticHighlighting.cpp @@ -23,6 +23,7 @@ #include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" +#include "clang/Tooling/Syntax/Tokens.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" @@ -128,40 +129,39 @@ llvm::Optional kindForReference(const ReferenceLoc &R) { return Result; } +// For a macro usage `DUMP(foo)`, we want: +// - DUMP --> "macro" +// - foo --> "variable". +SourceLocation getHighlightableSpellingToken(SourceLocation L, + const SourceManager &SM) { + if (L.isFileID()) + return SM.isWrittenInMainFile(L) ? L : SourceLocation{}; + // Tokens expanded from the macro body contribute no highlightings. + if (!SM.isMacroArgExpansion(L)) + return {}; + // Tokens expanded from macro args are potentially highlightable. + return getHighlightableSpellingToken(SM.getImmediateSpellingLoc(L), SM); +} + /// Consumes source locations and maps them to text ranges for highlightings. 
class HighlightingsBuilder { public: - HighlightingsBuilder(const SourceManager &SourceMgr, - const LangOptions &LangOpts) - : SourceMgr(SourceMgr), LangOpts(LangOpts) {} + HighlightingsBuilder(const ParsedAST &AST) + : TB(AST.getTokens()), SourceMgr(AST.getSourceManager()), + LangOpts(AST.getLangOpts()) {} void addToken(HighlightingToken T) { Tokens.push_back(T); } void addToken(SourceLocation Loc, HighlightingKind Kind) { + Loc = getHighlightableSpellingToken(Loc, SourceMgr); if (Loc.isInvalid()) return; - if (Loc.isMacroID()) { - // Only intereseted in highlighting arguments in macros (DEF_X(arg)). - if (!SourceMgr.isMacroArgExpansion(Loc)) - return; - Loc = SourceMgr.getSpellingLoc(Loc); - } - - // Non top level decls that are included from a header are not filtered by - // topLevelDecls. (example: method declarations being included from - // another file for a class from another file). - // There are also cases with macros where the spelling loc will not be in - // the main file and the highlighting would be incorrect. - if (!isInsideMainFile(Loc, SourceMgr)) - return; + const auto *Tok = TB.spelledTokenAt(Loc); + assert(Tok); - auto Range = getTokenRange(SourceMgr, LangOpts, Loc); - if (!Range) { - // R should always have a value, if it doesn't something is very wrong. - elog("Tried to add semantic token with an invalid range"); - return; - } - Tokens.push_back(HighlightingToken{Kind, *Range}); + auto Range = halfOpenToRange(SourceMgr, + Tok->range(SourceMgr).toCharRange(SourceMgr)); + Tokens.push_back(HighlightingToken{Kind, std::move(Range)}); } std::vector collect(ParsedAST &AST) && { @@ -211,6 +211,7 @@ class HighlightingsBuilder { } private: + const syntax::TokenBuffer &TB; const SourceManager &SourceMgr; const LangOptions &LangOpts; std::vector Tokens; @@ -311,7 +312,7 @@ takeLine(ArrayRef AllTokens, std::vector getSemanticHighlightings(ParsedAST &AST) { auto &C = AST.getASTContext(); // Add highlightings for AST nodes. 
- HighlightingsBuilder Builder(AST.getSourceManager(), C.getLangOpts()); + HighlightingsBuilder Builder(AST); // Highlight 'decltype' and 'auto' as their underlying types. CollectExtraHighlightings(Builder).TraverseAST(C); // Highlight all decls and references coming from the AST. diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp index 79d027def4bc35..d18daa910d18ee 100644 --- a/clang-tools-extra/clangd/SourceCode.cpp +++ b/clang-tools-extra/clangd/SourceCode.cpp @@ -225,17 +225,6 @@ bool isSpelledInSource(SourceLocation Loc, const SourceManager &SM) { return true; } -llvm::Optional getTokenRange(const SourceManager &SM, - const LangOptions &LangOpts, - SourceLocation TokLoc) { - if (!TokLoc.isValid()) - return llvm::None; - SourceLocation End = Lexer::getLocForEndOfToken(TokLoc, 0, SM, LangOpts); - if (!End.isValid()) - return llvm::None; - return halfOpenToRange(SM, CharSourceRange::getCharRange(TokLoc, End)); -} - bool isValidFileRange(const SourceManager &Mgr, SourceRange R) { if (!R.getBegin().isValid() || !R.getEnd().isValid()) return false; @@ -645,8 +634,7 @@ std::vector collectIdentifierRanges(llvm::StringRef Identifier, [&](const syntax::Token &Tok, const SourceManager &SM) { if (Tok.kind() != tok::identifier || Tok.text(SM) != Identifier) return; - if (auto Range = getTokenRange(SM, LangOpts, Tok.location())) - Ranges.push_back(*Range); + Ranges.push_back(halfOpenToRange(SM, Tok.range(SM).toCharRange(SM))); }); return Ranges; } diff --git a/clang-tools-extra/clangd/SourceCode.h b/clang-tools-extra/clangd/SourceCode.h index c601cc89df2809..383c57371b0059 100644 --- a/clang-tools-extra/clangd/SourceCode.h +++ b/clang-tools-extra/clangd/SourceCode.h @@ -69,11 +69,6 @@ Position offsetToPosition(llvm::StringRef Code, size_t Offset); /// FIXME: This should return an error if the location is invalid. Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc); -/// Returns the taken range at \p TokLoc. 
-llvm::Optional getTokenRange(const SourceManager &SM, - const LangOptions &LangOpts, - SourceLocation TokLoc); - /// Return the file location, corresponding to \p P. Note that one should take /// care to avoid comparing the result with expansion locations. llvm::Expected sourceLocationInMainFile(const SourceManager &SM, diff --git a/clang-tools-extra/clangd/TUScheduler.cpp b/clang-tools-extra/clangd/TUScheduler.cpp index f59c19e8031ee9..3f3162a33c303a 100644 --- a/clang-tools-extra/clangd/TUScheduler.cpp +++ b/clang-tools-extra/clangd/TUScheduler.cpp @@ -921,15 +921,6 @@ void TUScheduler::remove(PathRef File) { File); } -llvm::StringRef TUScheduler::getContents(PathRef File) const { - auto It = Files.find(File); - if (It == Files.end()) { - elog("getContents() for untracked file: {0}", File); - return ""; - } - return It->second->Contents; -} - llvm::StringMap TUScheduler::getAllFileContents() const { llvm::StringMap Results; for (auto &It : Files) diff --git a/clang-tools-extra/clangd/TUScheduler.h b/clang-tools-extra/clangd/TUScheduler.h index 5082612b0ccc36..948fde7ed1099c 100644 --- a/clang-tools-extra/clangd/TUScheduler.h +++ b/clang-tools-extra/clangd/TUScheduler.h @@ -213,10 +213,6 @@ class TUScheduler { /// if requested with WantDiags::Auto or WantDiags::Yes. void remove(PathRef File); - /// Returns the current contents of the buffer for File, per last update(). - /// The returned StringRef may be invalidated by any write to TUScheduler. - llvm::StringRef getContents(PathRef File) const; - /// Returns a snapshot of all file buffer contents, per last update(). 
llvm::StringMap getAllFileContents() const; diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp index 29c2338f5bb5a6..67f7bda6a5e65b 100644 --- a/clang-tools-extra/clangd/XRefs.cpp +++ b/clang-tools-extra/clangd/XRefs.cpp @@ -29,6 +29,7 @@ #include "clang/AST/ExprCXX.h" #include "clang/AST/Type.h" #include "clang/Basic/LLVM.h" +#include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Index/IndexDataConsumer.h" @@ -149,108 +150,78 @@ std::vector getDeclAtPosition(ParsedAST &AST, return Result; } -llvm::Optional makeLocation(ASTContext &AST, SourceLocation TokLoc, +// Expects Loc to be a SpellingLocation, will bail out otherwise as it can't +// figure out a filename. +llvm::Optional makeLocation(const ASTContext &AST, SourceLocation Loc, llvm::StringRef TUPath) { - const SourceManager &SourceMgr = AST.getSourceManager(); - const FileEntry *F = SourceMgr.getFileEntryForID(SourceMgr.getFileID(TokLoc)); + const auto &SM = AST.getSourceManager(); + const FileEntry *F = SM.getFileEntryForID(SM.getFileID(Loc)); if (!F) return None; - auto FilePath = getCanonicalPath(F, SourceMgr); + auto FilePath = getCanonicalPath(F, SM); if (!FilePath) { log("failed to get path!"); return None; } - if (auto Range = - getTokenRange(AST.getSourceManager(), AST.getLangOpts(), TokLoc)) { - Location L; - L.uri = URIForFile::canonicalize(*FilePath, TUPath); - L.range = *Range; - return L; - } - return None; + Location L; + L.uri = URIForFile::canonicalize(*FilePath, TUPath); + // We call MeasureTokenLength here as TokenBuffer doesn't store spelled tokens + // outside the main file. 
+ auto TokLen = Lexer::MeasureTokenLength(Loc, SM, AST.getLangOpts()); + L.range = halfOpenToRange( + SM, CharSourceRange::getCharRange(Loc, Loc.getLocWithOffset(TokLen))); + return L; } } // namespace -std::vector getDocumentLinks(ParsedAST &AST) { - const auto &SM = AST.getSourceManager(); - auto MainFilePath = - getCanonicalPath(SM.getFileEntryForID(SM.getMainFileID()), SM); - if (!MainFilePath) { - elog("Failed to get a path for the main file, so no links"); - return {}; - } - - std::vector Result; - for (auto &Inc : AST.getIncludeStructure().MainFileIncludes) { - if (!Inc.Resolved.empty()) { - Result.push_back(DocumentLink( - {Inc.R, URIForFile::canonicalize(Inc.Resolved, *MainFilePath)})); - } - } - - return Result; -} - -std::vector locateSymbolAt(ParsedAST &AST, Position Pos, - const SymbolIndex *Index) { - const auto &SM = AST.getSourceManager(); - auto MainFilePath = - getCanonicalPath(SM.getFileEntryForID(SM.getMainFileID()), SM); - if (!MainFilePath) { - elog("Failed to get a path for the main file, so no references"); - return {}; - } - - // Treat #included files as symbols, to enable go-to-definition on them. +// Treat #included files as symbols, to enable go-to-definition on them. +static llvm::Optional +locateFileReferent(const Position &Pos, ParsedAST &AST, + llvm::StringRef MainFilePath) { for (auto &Inc : AST.getIncludeStructure().MainFileIncludes) { if (!Inc.Resolved.empty() && Inc.R.start.line == Pos.line) { LocatedSymbol File; File.Name = std::string(llvm::sys::path::filename(Inc.Resolved)); File.PreferredDeclaration = { - URIForFile::canonicalize(Inc.Resolved, *MainFilePath), Range{}}; + URIForFile::canonicalize(Inc.Resolved, MainFilePath), Range{}}; File.Definition = File.PreferredDeclaration; // We're not going to find any further symbols on #include lines. 
- return {std::move(File)}; + return File; } } + return llvm::None; +} - auto CurLoc = sourceLocationInMainFile(SM, Pos); - if (!CurLoc) { - elog("locateSymbolAt failed to convert position to source location: {0}", - CurLoc.takeError()); - return {}; - } - - // Macros are simple: there's no declaration/definition distinction. - // As a consequence, there's no need to look them up in the index either. - std::vector Result; - const auto *TouchedIdentifier = - syntax::spelledIdentifierTouching(*CurLoc, AST.getTokens()); - if (TouchedIdentifier) { - if (auto M = locateMacroAt(*TouchedIdentifier, AST.getPreprocessor())) { - if (auto Loc = makeLocation(AST.getASTContext(), - M->Info->getDefinitionLoc(), *MainFilePath)) { - LocatedSymbol Macro; - Macro.Name = std::string(M->Name); - Macro.PreferredDeclaration = *Loc; - Macro.Definition = Loc; - Result.push_back(std::move(Macro)); - - // Don't look at the AST or index if we have a macro result. - // (We'd just return declarations referenced from the macro's - // expansion.) - return Result; - } +// Macros are simple: there's no declaration/definition distinction. +// As a consequence, there's no need to look them up in the index either. +static llvm::Optional +locateMacroReferent(const syntax::Token &TouchedIdentifier, ParsedAST &AST, + llvm::StringRef MainFilePath) { + if (auto M = locateMacroAt(TouchedIdentifier, AST.getPreprocessor())) { + if (auto Loc = makeLocation(AST.getASTContext(), + M->Info->getDefinitionLoc(), MainFilePath)) { + LocatedSymbol Macro; + Macro.Name = std::string(M->Name); + Macro.PreferredDeclaration = *Loc; + Macro.Definition = Loc; + return Macro; } } + return llvm::None; +} - // Decls are more complicated. - // The AST contains at least a declaration, maybe a definition. - // These are up-to-date, and so generally preferred over index results. - // We perform a single batch index lookup to find additional definitions. - +// Decls are more complicated. 
+// The AST contains at least a declaration, maybe a definition. +// These are up-to-date, and so generally preferred over index results. +// We perform a single batch index lookup to find additional definitions. +static std::vector +locateASTReferent(SourceLocation CurLoc, const syntax::Token *TouchedIdentifier, + ParsedAST &AST, llvm::StringRef MainFilePath, + const SymbolIndex *Index) { + const SourceManager &SM = AST.getSourceManager(); // Results follow the order of Symbols.Decls. + std::vector Result; // Keep track of SymbolID -> index mapping, to fill in index data later. llvm::DenseMap ResultIndex; @@ -259,7 +230,7 @@ std::vector locateSymbolAt(ParsedAST &AST, Position Pos, const NamedDecl *Preferred = Def ? Def : D; auto Loc = makeLocation(AST.getASTContext(), nameLocation(*Preferred, SM), - *MainFilePath); + MainFilePath); if (!Loc) return; @@ -278,7 +249,7 @@ std::vector locateSymbolAt(ParsedAST &AST, Position Pos, // Emit all symbol locations (declaration or definition) from AST. DeclRelationSet Relations = DeclRelation::TemplatePattern | DeclRelation::Alias; - for (const NamedDecl *D : getDeclAtPosition(AST, *CurLoc, Relations)) { + for (const NamedDecl *D : getDeclAtPosition(AST, CurLoc, Relations)) { // Special case: void foo() ^override: jump to the overridden method. if (const auto *CMD = llvm::dyn_cast(D)) { const InheritableAttr *Attr = D->getAttr(); @@ -320,23 +291,23 @@ std::vector locateSymbolAt(ParsedAST &AST, Position Pos, if (R.Definition) { // from AST // Special case: if the AST yielded a definition, then it may not be // the right *declaration*. Prefer the one from the index. - if (auto Loc = toLSPLocation(Sym.CanonicalDeclaration, *MainFilePath)) + if (auto Loc = toLSPLocation(Sym.CanonicalDeclaration, MainFilePath)) R.PreferredDeclaration = *Loc; // We might still prefer the definition from the index, e.g. for // generated symbols. 
if (auto Loc = toLSPLocation( getPreferredLocation(*R.Definition, Sym.Definition, Scratch), - *MainFilePath)) + MainFilePath)) R.Definition = *Loc; } else { - R.Definition = toLSPLocation(Sym.Definition, *MainFilePath); + R.Definition = toLSPLocation(Sym.Definition, MainFilePath); // Use merge logic to choose AST or index declaration. if (auto Loc = toLSPLocation( getPreferredLocation(R.PreferredDeclaration, Sym.CanonicalDeclaration, Scratch), - *MainFilePath)) + MainFilePath)) R.PreferredDeclaration = *Loc; } }); @@ -345,17 +316,75 @@ std::vector locateSymbolAt(ParsedAST &AST, Position Pos, return Result; } +std::vector locateSymbolAt(ParsedAST &AST, Position Pos, + const SymbolIndex *Index) { + const auto &SM = AST.getSourceManager(); + auto MainFilePath = + getCanonicalPath(SM.getFileEntryForID(SM.getMainFileID()), SM); + if (!MainFilePath) { + elog("Failed to get a path for the main file, so no references"); + return {}; + } + + if (auto File = locateFileReferent(Pos, AST, *MainFilePath)) + return {std::move(*File)}; + + auto CurLoc = sourceLocationInMainFile(SM, Pos); + if (!CurLoc) { + elog("locateSymbolAt failed to convert position to source location: {0}", + CurLoc.takeError()); + return {}; + } + + const syntax::Token *TouchedIdentifier = + syntax::spelledIdentifierTouching(*CurLoc, AST.getTokens()); + if (TouchedIdentifier) + if (auto Macro = + locateMacroReferent(*TouchedIdentifier, AST, *MainFilePath)) + // Don't look at the AST or index if we have a macro result. + // (We'd just return declarations referenced from the macro's + // expansion.) 
+ return {*std::move(Macro)}; + + return locateASTReferent(*CurLoc, TouchedIdentifier, AST, *MainFilePath, + Index); +} + +std::vector getDocumentLinks(ParsedAST &AST) { + const auto &SM = AST.getSourceManager(); + auto MainFilePath = + getCanonicalPath(SM.getFileEntryForID(SM.getMainFileID()), SM); + if (!MainFilePath) { + elog("Failed to get a path for the main file, so no links"); + return {}; + } + + std::vector Result; + for (auto &Inc : AST.getIncludeStructure().MainFileIncludes) { + if (!Inc.Resolved.empty()) { + Result.push_back(DocumentLink( + {Inc.R, URIForFile::canonicalize(Inc.Resolved, *MainFilePath)})); + } + } + + return Result; +} + namespace { /// Collects references to symbols within the main file. class ReferenceFinder : public index::IndexDataConsumer { public: struct Reference { - SourceLocation Loc; + syntax::Token SpelledTok; index::SymbolRoleSet Role; + + Range range(const SourceManager &SM) const { + return halfOpenToRange(SM, SpelledTok.range(SM).toCharRange(SM)); + } }; - ReferenceFinder(ASTContext &AST, Preprocessor &PP, + ReferenceFinder(const ParsedAST &AST, const std::vector &TargetDecls) : AST(AST) { for (const NamedDecl *D : TargetDecls) @@ -364,13 +393,17 @@ class ReferenceFinder : public index::IndexDataConsumer { std::vector take() && { llvm::sort(References, [](const Reference &L, const Reference &R) { - return std::tie(L.Loc, L.Role) < std::tie(R.Loc, R.Role); + auto LTok = L.SpelledTok.location(); + auto RTok = R.SpelledTok.location(); + return std::tie(LTok, L.Role) < std::tie(RTok, R.Role); }); // We sometimes see duplicates when parts of the AST get traversed twice. 
References.erase(std::unique(References.begin(), References.end(), [](const Reference &L, const Reference &R) { - return std::tie(L.Loc, L.Role) == - std::tie(R.Loc, R.Role); + auto LTok = L.SpelledTok.location(); + auto RTok = R.SpelledTok.location(); + return std::tie(LTok, L.Role) == + std::tie(RTok, R.Role); }), References.end()); return std::move(References); @@ -382,22 +415,27 @@ class ReferenceFinder : public index::IndexDataConsumer { SourceLocation Loc, index::IndexDataConsumer::ASTNodeInfo ASTNode) override { assert(D->isCanonicalDecl() && "expect D to be a canonical declaration"); + if (!CanonicalTargets.count(D)) + return true; + const auto &TB = AST.getTokens(); const SourceManager &SM = AST.getSourceManager(); Loc = SM.getFileLoc(Loc); - if (isInsideMainFile(Loc, SM) && CanonicalTargets.count(D)) - References.push_back({Loc, Roles}); + // We are only traversing decls *inside* the main file, so this should hold. + assert(isInsideMainFile(Loc, SM)); + if (const auto *Tok = TB.spelledTokenAt(Loc)) + References.push_back({*Tok, Roles}); return true; } private: llvm::SmallSet CanonicalTargets; std::vector References; - const ASTContext &AST; + const ParsedAST &AST; }; std::vector findRefs(const std::vector &Decls, ParsedAST &AST) { - ReferenceFinder RefFinder(AST.getASTContext(), AST.getPreprocessor(), Decls); + ReferenceFinder RefFinder(AST, Decls); index::IndexingOptions IndexOpts; IndexOpts.SystemSymbolFilter = index::IndexingOptions::SystemSymbolFilterKind::All; @@ -428,18 +466,15 @@ std::vector findDocumentHighlights(ParsedAST &AST, // different kinds, deduplicate them. 
std::vector Result; for (const auto &Ref : References) { - if (auto Range = - getTokenRange(AST.getSourceManager(), AST.getLangOpts(), Ref.Loc)) { - DocumentHighlight DH; - DH.range = *Range; - if (Ref.Role & index::SymbolRoleSet(index::SymbolRole::Write)) - DH.kind = DocumentHighlightKind::Write; - else if (Ref.Role & index::SymbolRoleSet(index::SymbolRole::Read)) - DH.kind = DocumentHighlightKind::Read; - else - DH.kind = DocumentHighlightKind::Text; - Result.push_back(std::move(DH)); - } + DocumentHighlight DH; + DH.range = Ref.range(SM); + if (Ref.Role & index::SymbolRoleSet(index::SymbolRole::Write)) + DH.kind = DocumentHighlightKind::Write; + else if (Ref.Role & index::SymbolRoleSet(index::SymbolRole::Read)) + DH.kind = DocumentHighlightKind::Read; + else + DH.kind = DocumentHighlightKind::Text; + Result.push_back(std::move(DH)); } return Result; } @@ -502,16 +537,15 @@ ReferencesResult findReferences(ParsedAST &AST, Position Pos, uint32_t Limit, MainFileRefs.erase(std::unique(MainFileRefs.begin(), MainFileRefs.end(), [](const ReferenceFinder::Reference &L, const ReferenceFinder::Reference &R) { - return L.Loc == R.Loc; + return L.SpelledTok.location() == + R.SpelledTok.location(); }), MainFileRefs.end()); for (const auto &Ref : MainFileRefs) { - if (auto Range = getTokenRange(SM, AST.getLangOpts(), Ref.Loc)) { - Location Result; - Result.range = *Range; - Result.uri = URIMainFile; - Results.References.push_back(std::move(Result)); - } + Location Result; + Result.range = Ref.range(SM); + Result.uri = URIMainFile; + Results.References.push_back(std::move(Result)); } if (Index && Results.References.size() <= Limit) { for (const Decl *D : Decls) { diff --git a/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp b/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp index bdfd0e4743d64f..398b6f29dba85d 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp @@ -16,6 
+16,7 @@ #include "SourceCode.h" #include "refactor/Tweak.h" #include "clang/AST/ASTTypeTraits.h" +#include "clang/AST/Attr.h" #include "clang/AST/Decl.h" #include "clang/AST/DeclBase.h" #include "clang/AST/DeclCXX.h" @@ -156,7 +157,7 @@ getFunctionSourceCode(const FunctionDecl *FD, llvm::StringRef TargetNamespace, "define outline: couldn't find a context for target"); llvm::Error Errors = llvm::Error::success(); - tooling::Replacements QualifierInsertions; + tooling::Replacements DeclarationCleanups; // Finds the first unqualified name in function return type and name, then // qualifies those to be valid in TargetContext. @@ -181,7 +182,7 @@ getFunctionSourceCode(const FunctionDecl *FD, llvm::StringRef TargetNamespace, const NamedDecl *ND = Ref.Targets.front(); const std::string Qualifier = getQualification( AST, *TargetContext, SM.getLocForStartOfFile(SM.getMainFileID()), ND); - if (auto Err = QualifierInsertions.add( + if (auto Err = DeclarationCleanups.add( tooling::Replacement(SM, Ref.NameLoc, 0, Qualifier))) Errors = llvm::joinErrors(std::move(Errors), std::move(Err)); }); @@ -206,14 +207,72 @@ getFunctionSourceCode(const FunctionDecl *FD, llvm::StringRef TargetNamespace, assert(Tok != Tokens.rend()); DelRange.setBegin(Tok->location()); if (auto Err = - QualifierInsertions.add(tooling::Replacement(SM, DelRange, ""))) + DeclarationCleanups.add(tooling::Replacement(SM, DelRange, ""))) Errors = llvm::joinErrors(std::move(Errors), std::move(Err)); } } + auto DelAttr = [&](const Attr *A) { + if (!A) + return; + auto AttrTokens = + TokBuf.spelledForExpanded(TokBuf.expandedTokens(A->getRange())); + assert(A->getLocation().isValid()); + if (!AttrTokens || AttrTokens->empty()) { + Errors = llvm::joinErrors( + std::move(Errors), + llvm::createStringError( + llvm::inconvertibleErrorCode(), + llvm::StringRef("define outline: Can't move out of line as " + "function has a macro `") + + A->getSpelling() + "` specifier.")); + return; + } + CharSourceRange DelRange = + 
syntax::Token::range(SM, AttrTokens->front(), AttrTokens->back()) + .toCharRange(SM); + if (auto Err = + DeclarationCleanups.add(tooling::Replacement(SM, DelRange, ""))) + Errors = llvm::joinErrors(std::move(Errors), std::move(Err)); + }; + + DelAttr(FD->getAttr()); + DelAttr(FD->getAttr()); + + if (FD->isVirtualAsWritten()) { + SourceRange SpecRange{FD->getBeginLoc(), FD->getLocation()}; + bool HasErrors = true; + + // Clang allows duplicating virtual specifiers so check for multiple + // occurances. + for (const auto &Tok : TokBuf.expandedTokens(SpecRange)) { + if (Tok.kind() != tok::kw_virtual) + continue; + auto Spelling = TokBuf.spelledForExpanded(llvm::makeArrayRef(Tok)); + if (!Spelling) { + HasErrors = true; + break; + } + HasErrors = false; + CharSourceRange DelRange = + syntax::Token::range(SM, Spelling->front(), Spelling->back()) + .toCharRange(SM); + if (auto Err = + DeclarationCleanups.add(tooling::Replacement(SM, DelRange, ""))) + Errors = llvm::joinErrors(std::move(Errors), std::move(Err)); + } + if (HasErrors) { + Errors = llvm::joinErrors( + std::move(Errors), + llvm::createStringError(llvm::inconvertibleErrorCode(), + "define outline: Can't move out of line as " + "function has a macro `virtual` specifier.")); + } + } + if (Errors) return std::move(Errors); - return getFunctionSourceAfterReplacements(FD, QualifierInsertions); + return getFunctionSourceAfterReplacements(FD, DeclarationCleanups); } struct InsertionPoint { diff --git a/clang-tools-extra/clangd/test/initialize-params.test b/clang-tools-extra/clangd/test/initialize-params.test index 68e3ebc24a2145..2b5c02fc8ce287 100644 --- a/clang-tools-extra/clangd/test/initialize-params.test +++ b/clang-tools-extra/clangd/test/initialize-params.test @@ -47,6 +47,10 @@ # CHECK-NEXT: "textDocumentSync": 2, # CHECK-NEXT: "typeHierarchyProvider": true # CHECK-NEXT: "workspaceSymbolProvider": true +# CHECK-NEXT: }, +# CHECK-NEXT: "serverInfo": { +# CHECK-NEXT: "name": "clangd", +# CHECK-NEXT: "version": 
"{{.*}}clangd version {{.*}}" # CHECK-NEXT: } # CHECK-NEXT: } --- diff --git a/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp b/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp index 2140997679904b..9e5952fe2cb53b 100644 --- a/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp +++ b/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp @@ -138,9 +138,7 @@ TEST_F(TUSchedulerTests, MissingFiles) { auto Missing = testPath("missing.cpp"); Files[Missing] = ""; - EXPECT_EQ(S.getContents(Added), ""); S.update(Added, getInputs(Added, "x"), WantDiagnostics::No); - EXPECT_EQ(S.getContents(Added), "x"); // Assert each operation for missing file is an error (even if it's // available in VFS). @@ -159,9 +157,7 @@ TEST_F(TUSchedulerTests, MissingFiles) { [&](Expected Preamble) { EXPECT_TRUE(bool(Preamble)); }); - EXPECT_EQ(S.getContents(Added), "x"); S.remove(Added); - EXPECT_EQ(S.getContents(Added), ""); // Assert that all operations fail after removing the file. S.runWithAST("", Added, diff --git a/clang-tools-extra/clangd/unittests/TweakTests.cpp b/clang-tools-extra/clangd/unittests/TweakTests.cpp index 24210aaa101d42..cae922ffcb9558 100644 --- a/clang-tools-extra/clangd/unittests/TweakTests.cpp +++ b/clang-tools-extra/clangd/unittests/TweakTests.cpp @@ -2068,6 +2068,80 @@ TEST_F(DefineOutlineTest, ApplyTest) { };)cpp", "Foo::Foo(int z) __attribute__((weak)) : bar(2){}\n", }, + // Virt specifiers. 
+ { + R"cpp( + struct A { + virtual void f^oo() {} + };)cpp", + R"cpp( + struct A { + virtual void foo() ; + };)cpp", + " void A::foo() {}\n", + }, + { + R"cpp( + struct A { + virtual virtual void virtual f^oo() {} + };)cpp", + R"cpp( + struct A { + virtual virtual void virtual foo() ; + };)cpp", + " void A::foo() {}\n", + }, + { + R"cpp( + struct A { + virtual void foo() = 0; + }; + struct B : A { + void fo^o() override {} + };)cpp", + R"cpp( + struct A { + virtual void foo() = 0; + }; + struct B : A { + void foo() override ; + };)cpp", + "void B::foo() {}\n", + }, + { + R"cpp( + struct A { + virtual void foo() = 0; + }; + struct B : A { + void fo^o() final {} + };)cpp", + R"cpp( + struct A { + virtual void foo() = 0; + }; + struct B : A { + void foo() final ; + };)cpp", + "void B::foo() {}\n", + }, + { + R"cpp( + struct A { + virtual void foo() = 0; + }; + struct B : A { + void fo^o() final override {} + };)cpp", + R"cpp( + struct A { + virtual void foo() = 0; + }; + struct B : A { + void foo() final override ; + };)cpp", + "void B::foo() {}\n", + }, }; for (const auto &Case : Cases) { SCOPED_TRACE(Case.Test); @@ -2081,6 +2155,8 @@ TEST_F(DefineOutlineTest, HandleMacros) { llvm::StringMap EditedFiles; ExtraFiles["Test.cpp"] = ""; FileName = "Test.hpp"; + ExtraArgs.push_back("-DVIRTUAL=virtual"); + ExtraArgs.push_back("-DOVER=override"); struct { llvm::StringRef Test; @@ -2118,6 +2194,48 @@ TEST_F(DefineOutlineTest, HandleMacros) { #define TARGET foo void TARGET();)cpp", "void TARGET(){ return; }"}, + {R"cpp(#define VIRT virtual + struct A { + VIRT void f^oo() {} + };)cpp", + R"cpp(#define VIRT virtual + struct A { + VIRT void foo() ; + };)cpp", + " void A::foo() {}\n"}, + {R"cpp( + struct A { + VIRTUAL void f^oo() {} + };)cpp", + R"cpp( + struct A { + VIRTUAL void foo() ; + };)cpp", + " void A::foo() {}\n"}, + {R"cpp( + struct A { + virtual void foo() = 0; + }; + struct B : A { + void fo^o() OVER {} + };)cpp", + R"cpp( + struct A { + virtual void foo() = 0; + }; 
+ struct B : A { + void foo() OVER ; + };)cpp", + "void B::foo() {}\n"}, + {R"cpp(#define STUPID_MACRO(X) virtual + struct A { + STUPID_MACRO(sizeof sizeof int) void f^oo() {} + };)cpp", + R"cpp(#define STUPID_MACRO(X) virtual + struct A { + STUPID_MACRO(sizeof sizeof int) void foo() ; + };)cpp", + " void A::foo() {}\n"}, }; for (const auto &Case : Cases) { SCOPED_TRACE(Case.Test); @@ -2229,6 +2347,49 @@ TEST_F(DefineOutlineTest, QualifyFunctionName) { << Case.TestHeader; } } + +TEST_F(DefineOutlineTest, FailsMacroSpecifier) { + FileName = "Test.hpp"; + ExtraFiles["Test.cpp"] = ""; + ExtraArgs.push_back("-DFINALOVER=final override"); + + std::pair Cases[] = { + { + R"cpp( + #define VIRT virtual void + struct A { + VIRT fo^o() {} + };)cpp", + "fail: define outline: Can't move out of line as function has a " + "macro `virtual` specifier."}, + { + R"cpp( + #define OVERFINAL final override + struct A { + virtual void foo() {} + }; + struct B : A { + void fo^o() OVERFINAL {} + };)cpp", + "fail: define outline: Can't move out of line as function has a " + "macro `override` specifier.\ndefine outline: Can't move out of line " + "as function has a macro `final` specifier."}, + { + R"cpp( + struct A { + virtual void foo() {} + }; + struct B : A { + void fo^o() FINALOVER {} + };)cpp", + "fail: define outline: Can't move out of line as function has a " + "macro `override` specifier.\ndefine outline: Can't move out of line " + "as function has a macro `final` specifier."}, + }; + for (const auto &Case : Cases) { + EXPECT_EQ(apply(Case.first), Case.second); + } +} } // namespace } // namespace clangd } // namespace clang diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 831ef32c6b8a4c..ce121ebe6055bb 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -101,12 +101,10 @@ Windows Support C Language Changes in Clang --------------------------- -- ... 
- -C11 Feature Support -^^^^^^^^^^^^^^^^^^^ +- The default C language standard used when `-std=` is not specified has been + upgraded from gnu11 to gnu17. -... +- ... C++ Language Changes in Clang ----------------------------- diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 856d5e34bbcc26..f50f8888f477ed 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -80,7 +80,7 @@ Basic Usage Intro to how to use a C compiler for newbies. compile + link compile then link debug info enabling optimizations -picking a language to use, defaults to C11 by default. Autosenses based +picking a language to use, defaults to C17 by default. Autosenses based on extension. using a makefile Command Line Options @@ -2399,10 +2399,10 @@ See :doc:`LanguageExtensions`. Differences between various standard modes ------------------------------------------ -clang supports the -std option, which changes what language mode clang -uses. The supported modes for C are c89, gnu89, c99, gnu99, c11, gnu11, -c17, gnu17, and various aliases for those modes. If no -std option is -specified, clang defaults to gnu11 mode. Many C99 and C11 features are +clang supports the -std option, which changes what language mode clang uses. +The supported modes for C are c89, gnu89, c99, gnu99, c11, gnu11, c17, gnu17, +c2x, gnu2x, and various aliases for those modes. If no -std option is +specified, clang defaults to gnu17 mode. Many C99 and C11 features are supported in earlier modes as a conforming extension, with a warning. Use ``-pedantic-errors`` to request an error if a feature from a later standard revision is used in an earlier mode. 
diff --git a/clang/docs/analyzer/developer-docs/DebugChecks.rst b/clang/docs/analyzer/developer-docs/DebugChecks.rst index 3f9bed78604f0d..05b3e2480d3b7e 100644 --- a/clang/docs/analyzer/developer-docs/DebugChecks.rst +++ b/clang/docs/analyzer/developer-docs/DebugChecks.rst @@ -275,6 +275,28 @@ ExprInspection checks See clang_analyzer_denote(). +- ``void clang_analyzer_isTainted(a single argument of any type);`` + + Queries the analyzer whether the expression used as argument is tainted or not. + This is useful in tests, where we don't want to issue warning for all tainted + expressions but only check for certain expressions. + This would help to reduce the *noise* that the `TaintTest` debug checker would + introduce and let you focus on the `expected-warning`s that you really care + about. + + Example usage:: + + int read_integer() { + int n; + clang_analyzer_isTainted(n); // expected-warning{{NO}} + scanf("%d", &n); + clang_analyzer_isTainted(n); // expected-warning{{YES}} + clang_analyzer_isTainted(n + 2); // expected-warning{{YES}} + clang_analyzer_isTainted(n > 0); // expected-warning{{YES}} + int next_tainted_value = n; // no-warning + return n; + } + Statistics ========== diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index efb96f3cc5b6bf..9d4930a3887a74 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -2574,7 +2574,11 @@ enum CXCursorKind { */ CXCursor_OMPParallelMasterDirective = 285, - CXCursor_LastStmt = CXCursor_OMPParallelMasterDirective, + /** OpenMP depobj directive. + */ + CXCursor_OMPDepobjDirective = 286, + + CXCursor_LastStmt = CXCursor_OMPDepobjDirective, /** * Cursor that represents the translation unit itself. 
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index 453c068bbeb074..fa727837a80298 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -4108,6 +4108,92 @@ class OMPFlushClause final } }; +/// This represents implicit clause 'depobj' for the '#pragma omp depobj' +/// directive. +/// This clause does not exist by itself, it can be only as a part of 'omp +/// depobj' directive. This clause is introduced to keep the original structure +/// of \a OMPExecutableDirective class and its derivatives and to use the +/// existing infrastructure of clauses with the list of variables. +/// +/// \code +/// #pragma omp depobj(a) destroy +/// \endcode +/// In this example directive '#pragma omp depobj' has implicit clause 'depobj' +/// with the depobj 'a'. +class OMPDepobjClause final : public OMPClause { + friend class OMPClauseReader; + + /// Location of '('. + SourceLocation LParenLoc; + + /// Chunk size. + Expr *Depobj = nullptr; + + /// Build clause with number of variables \a N. + /// + /// \param StartLoc Starting location of the clause. + /// \param LParenLoc Location of '('. + /// \param EndLoc Ending location of the clause. + OMPDepobjClause(SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation EndLoc) + : OMPClause(OMPC_depobj, StartLoc, EndLoc), LParenLoc(LParenLoc) {} + + /// Build an empty clause. + /// + explicit OMPDepobjClause() + : OMPClause(OMPC_depobj, SourceLocation(), SourceLocation()) {} + + void setDepobj(Expr *E) { Depobj = E; } + + /// Sets the location of '('. + void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; } + +public: + /// Creates clause. + /// + /// \param C AST context. + /// \param StartLoc Starting location of the clause. + /// \param LParenLoc Location of '('. + /// \param EndLoc Ending location of the clause. + /// \param Depobj depobj expression associated with the 'depobj' directive. 
+ static OMPDepobjClause *Create(const ASTContext &C, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc, Expr *Depobj); + + /// Creates an empty clause. + /// + /// \param C AST context. + static OMPDepobjClause *CreateEmpty(const ASTContext &C); + + /// Returns depobj expression associated with the clause. + Expr *getDepobj() { return Depobj; } + const Expr *getDepobj() const { return Depobj; } + + /// Returns the location of '('. + SourceLocation getLParenLoc() const { return LParenLoc; } + + child_range children() { + return child_range(reinterpret_cast(&Depobj), + reinterpret_cast(&Depobj) + 1); + } + + const_child_range children() const { + auto Children = const_cast(this)->children(); + return const_child_range(Children.begin(), Children.end()); + } + + child_range used_children() { + return child_range(child_iterator(), child_iterator()); + } + const_child_range used_children() const { + return const_child_range(const_child_iterator(), const_child_iterator()); + } + + static bool classof(const OMPClause *T) { + return T->getClauseKind() == OMPC_depobj; + } +}; + /// This represents implicit clause 'depend' for the '#pragma omp task' /// directive. /// @@ -6607,6 +6693,46 @@ class OMPOrderClause final : public OMPClause { } }; +/// This represents 'destroy' clause in the '#pragma omp depobj' +/// directive. +/// +/// \code +/// #pragma omp depobj(a) destroy +/// \endcode +/// In this example directive '#pragma omp depobj' has 'destroy' clause. +class OMPDestroyClause final : public OMPClause { +public: + /// Build 'destroy' clause. + /// + /// \param StartLoc Starting location of the clause. + /// \param EndLoc Ending location of the clause. + OMPDestroyClause(SourceLocation StartLoc, SourceLocation EndLoc) + : OMPClause(OMPC_destroy, StartLoc, EndLoc) {} + + /// Build an empty clause. 
+ OMPDestroyClause() + : OMPClause(OMPC_destroy, SourceLocation(), SourceLocation()) {} + + child_range children() { + return child_range(child_iterator(), child_iterator()); + } + + const_child_range children() const { + return const_child_range(const_child_iterator(), const_child_iterator()); + } + + child_range used_children() { + return child_range(child_iterator(), child_iterator()); + } + const_child_range used_children() const { + return const_child_range(const_child_iterator(), const_child_iterator()); + } + + static bool classof(const OMPClause *T) { + return T->getClauseKind() == OMPC_destroy; + } +}; + /// This class implements a simple visitor for OMPClause /// subclasses. template class Ptr, typename RetTy> diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 29b2c354100209..3dc9af4b804209 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -2842,6 +2842,9 @@ DEF_TRAVERSE_STMT(OMPCancelDirective, DEF_TRAVERSE_STMT(OMPFlushDirective, { TRY_TO(TraverseOMPExecutableDirective(S)); }) +DEF_TRAVERSE_STMT(OMPDepobjDirective, + { TRY_TO(TraverseOMPExecutableDirective(S)); }) + DEF_TRAVERSE_STMT(OMPOrderedDirective, { TRY_TO(TraverseOMPExecutableDirective(S)); }) @@ -3156,6 +3159,11 @@ bool RecursiveASTVisitor::VisitOMPNogroupClause(OMPNogroupClause *) { return true; } +template +bool RecursiveASTVisitor::VisitOMPDestroyClause(OMPDestroyClause *) { + return true; +} + template template bool RecursiveASTVisitor::VisitOMPClauseList(T *Node) { @@ -3347,6 +3355,12 @@ bool RecursiveASTVisitor::VisitOMPFlushClause(OMPFlushClause *C) { return true; } +template +bool RecursiveASTVisitor::VisitOMPDepobjClause(OMPDepobjClause *C) { + TRY_TO(TraverseStmt(C->getDepobj())); + return true; +} + template bool RecursiveASTVisitor::VisitOMPDependClause(OMPDependClause *C) { TRY_TO(VisitOMPClauseList(C)); diff --git a/clang/include/clang/AST/StmtOpenMP.h 
b/clang/include/clang/AST/StmtOpenMP.h index 55649079bd2b17..5f7589acdb9e34 100644 --- a/clang/include/clang/AST/StmtOpenMP.h +++ b/clang/include/clang/AST/StmtOpenMP.h @@ -2314,6 +2314,64 @@ class OMPFlushDirective : public OMPExecutableDirective { } }; +/// This represents '#pragma omp depobj' directive. +/// +/// \code +/// #pragma omp depobj(a) depend(in:x,y) +/// \endcode +/// In this example directive '#pragma omp depobj' initializes a depobj object +/// 'a' with dependence type 'in' and a list with 'x' and 'y' locators. +class OMPDepobjDirective final : public OMPExecutableDirective { + friend class ASTStmtReader; + + /// Build directive with the given start and end location. + /// + /// \param StartLoc Starting location of the directive kind. + /// \param EndLoc Ending location of the directive. + /// \param NumClauses Number of clauses. + /// + OMPDepobjDirective(SourceLocation StartLoc, SourceLocation EndLoc, + unsigned NumClauses) + : OMPExecutableDirective(this, OMPDepobjDirectiveClass, + llvm::omp::OMPD_depobj, StartLoc, EndLoc, + NumClauses, 0) {} + + /// Build an empty directive. + /// + /// \param NumClauses Number of clauses. + /// + explicit OMPDepobjDirective(unsigned NumClauses) + : OMPExecutableDirective(this, OMPDepobjDirectiveClass, + llvm::omp::OMPD_depobj, SourceLocation(), + SourceLocation(), NumClauses, 0) {} + +public: + /// Creates directive with a list of \a Clauses. + /// + /// \param C AST context. + /// \param StartLoc Starting location of the directive kind. + /// \param EndLoc Ending Location of the directive. + /// \param Clauses List of clauses. + /// + static OMPDepobjDirective *Create(const ASTContext &C, + SourceLocation StartLoc, + SourceLocation EndLoc, + ArrayRef Clauses); + + /// Creates an empty directive with the place for \a NumClauses + /// clauses. + /// + /// \param C AST context. + /// \param NumClauses Number of clauses. 
+ /// + static OMPDepobjDirective *CreateEmpty(const ASTContext &C, + unsigned NumClauses, EmptyShell); + + static bool classof(const Stmt *T) { + return T->getStmtClass() == OMPDepobjDirectiveClass; + } +}; + /// This represents '#pragma omp ordered' directive. /// /// \code diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 2d6978f3f41343..e6155d5d0e1018 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10031,8 +10031,14 @@ def note_omp_invalid_subscript_on_this_ptr_map : Note < "expected 'this' subscript expression on map clause to be 'this[0]'">; def err_omp_invalid_map_this_expr : Error < "invalid 'this' expression on 'map' clause">; -def err_implied_omp_allocator_handle_t_not_found : Error< - "omp_allocator_handle_t type not found; include ">; +def err_omp_implied_type_not_found : Error< + "'%0' type not found; include ">; +def err_omp_expected_omp_depend_t_lvalue : Error< + "expected lvalue expression%select{ of 'omp_depend_t' type, not %1|}0">; +def err_omp_depobj_expected : Error< + "expected depobj expression">; +def err_omp_depobj_single_clause_expected : Error< + "exactly one of 'depend', 'destroy', or 'update' clauses is expected">; def err_omp_expected_predefined_allocator : Error< "expected one of the predefined allocators for the variables with the static " "storage: 'omp_default_mem_alloc', 'omp_large_cap_mem_alloc', " diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index f2913fe8e9bbdf..388204c3c1934e 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -209,6 +209,9 @@ #ifndef OPENMP_FLUSH_CLAUSE #define OPENMP_FLUSH_CLAUSE(Name) #endif +#ifndef OPENMP_DEPOBJ_CLAUSE +#define OPENMP_DEPOBJ_CLAUSE(Name) +#endif // OpenMP clauses. 
OPENMP_CLAUSE(allocator, OMPAllocatorClause) @@ -272,6 +275,8 @@ OPENMP_CLAUSE(atomic_default_mem_order, OMPAtomicDefaultMemOrderClause) OPENMP_CLAUSE(allocate, OMPAllocateClause) OPENMP_CLAUSE(nontemporal, OMPNontemporalClause) OPENMP_CLAUSE(order, OMPOrderClause) +OPENMP_CLAUSE(depobj, OMPDepobjClause) +OPENMP_CLAUSE(destroy, OMPDestroyClause) // Clauses allowed for OpenMP directive 'parallel'. OPENMP_PARALLEL_CLAUSE(if) @@ -1078,6 +1083,11 @@ OPENMP_FLUSH_CLAUSE(acq_rel) OPENMP_FLUSH_CLAUSE(acquire) OPENMP_FLUSH_CLAUSE(release) +// Clauses allowed for OpenMP directive 'depobj'. +OPENMP_DEPOBJ_CLAUSE(depend) +OPENMP_DEPOBJ_CLAUSE(destroy) + +#undef OPENMP_DEPOBJ_CLAUSE #undef OPENMP_FLUSH_CLAUSE #undef OPENMP_ORDER_KIND #undef OPENMP_LASTPRIVATE_KIND diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td index 2333202968198e..41c6dbdb42e958 100644 --- a/clang/include/clang/Basic/StmtNodes.td +++ b/clang/include/clang/Basic/StmtNodes.td @@ -232,6 +232,7 @@ def OMPBarrierDirective : StmtNode; def OMPTaskwaitDirective : StmtNode; def OMPTaskgroupDirective : StmtNode; def OMPFlushDirective : StmtNode; +def OMPDepobjDirective : StmtNode; def OMPOrderedDirective : StmtNode; def OMPAtomicDirective : StmtNode; def OMPTargetDirective : StmtNode; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index f1dfe411983a6e..9a3fc9585c98e7 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -9994,6 +9994,10 @@ class Sema final { StmtResult ActOnOpenMPFlushDirective(ArrayRef Clauses, SourceLocation StartLoc, SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp depobj'. + StmtResult ActOnOpenMPDepobjDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc); /// Called on well-formed '\#pragma omp ordered' after parsing of the /// associated statement. 
StmtResult ActOnOpenMPOrderedDirective(ArrayRef Clauses, @@ -10340,6 +10344,9 @@ class Sema final { /// Called on well-formed 'relaxed' clause. OMPClause *ActOnOpenMPRelaxedClause(SourceLocation StartLoc, SourceLocation EndLoc); + /// Called on well-formed 'destroy' clause. + OMPClause *ActOnOpenMPDestroyClause(SourceLocation StartLoc, + SourceLocation EndLoc); /// Called on well-formed 'threads' clause. OMPClause *ActOnOpenMPThreadsClause(SourceLocation StartLoc, SourceLocation EndLoc); @@ -10452,6 +10459,10 @@ class Sema final { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc); + /// Called on well-formed 'depobj' pseudo clause. + OMPClause *ActOnOpenMPDepobjClause(Expr *Depobj, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); /// Called on well-formed 'depend' clause. OMPClause * ActOnOpenMPDependClause(OpenMPDependClauseKind DepKind, SourceLocation DepLoc, diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 83af4d15e27b59..0767fb8e22c18a 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1825,6 +1825,7 @@ namespace serialization { STMT_OMP_BARRIER_DIRECTIVE, STMT_OMP_TASKWAIT_DIRECTIVE, STMT_OMP_FLUSH_DIRECTIVE, + STMT_OMP_DEPOBJ_DIRECTIVE, STMT_OMP_ORDERED_DIRECTIVE, STMT_OMP_ATOMIC_DIRECTIVE, STMT_OMP_TARGET_DIRECTIVE, diff --git a/clang/include/clang/Tooling/Syntax/Tokens.h b/clang/include/clang/Tooling/Syntax/Tokens.h index 19d120ebbc9f81..2ee84007481082 100644 --- a/clang/include/clang/Tooling/Syntax/Tokens.h +++ b/clang/include/clang/Tooling/Syntax/Tokens.h @@ -245,6 +245,10 @@ class TokenBuffer { /// "DECL", "(", "a", ")", ";"} llvm::ArrayRef spelledTokens(FileID FID) const; + /// Returns the spelled Token starting at Loc, if there are no such tokens + /// returns nullptr. 
+ const syntax::Token *spelledTokenAt(SourceLocation Loc) const; + /// Get all tokens that expand a macro in \p FID. For the following input /// #define FOO B /// #define FOO2(X) int X diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index bc4f2b491e11cc..93a8aab7c06855 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -474,10 +474,20 @@ void ASTContext::attachCommentsToJustParsedDecls(ArrayRef Decls, if (Comments.empty() || Decls.empty()) return; - // See if there are any new comments that are not attached to a decl. - // The location doesn't have to be precise - we care only about the file. - const FileID File = - SourceMgr.getDecomposedLoc((*Decls.begin())->getLocation()).first; + FileID File; + for (Decl *D : Decls) { + SourceLocation Loc = D->getLocation(); + if (Loc.isValid()) { + // See if there are any new comments that are not attached to a decl. + // The location doesn't have to be precise - we care only about the file. + File = SourceMgr.getDecomposedLoc(Loc).first; + break; + } + } + + if (File.isInvalid()) + return; + auto CommentsInThisFile = Comments.getCommentsInFile(File); if (!CommentsInThisFile || CommentsInThisFile->empty() || CommentsInThisFile->rbegin()->second->isAttached()) diff --git a/clang/lib/AST/CommentCommandTraits.cpp b/clang/lib/AST/CommentCommandTraits.cpp index b306fcbb154f3c..bdc0dd47fb7d2f 100644 --- a/clang/lib/AST/CommentCommandTraits.cpp +++ b/clang/lib/AST/CommentCommandTraits.cpp @@ -8,6 +8,7 @@ #include "clang/AST/CommentCommandTraits.h" #include "llvm/ADT/STLExtras.h" +#include namespace clang { namespace comments { diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index c5c9bc72c294ea..2bd02a0cda4f7c 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -111,6 +111,7 @@ const OMPClauseWithPreInit *OMPClauseWithPreInit::get(const OMPClause *C) { case OMPC_mergeable: case OMPC_threadprivate: case OMPC_flush: + case 
OMPC_depobj: case OMPC_read: case OMPC_write: case OMPC_update: @@ -142,6 +143,7 @@ const OMPClauseWithPreInit *OMPClauseWithPreInit::get(const OMPClause *C) { case OMPC_match: case OMPC_nontemporal: case OMPC_order: + case OMPC_destroy: break; } @@ -189,6 +191,7 @@ const OMPClauseWithPostUpdate *OMPClauseWithPostUpdate::get(const OMPClause *C) case OMPC_mergeable: case OMPC_threadprivate: case OMPC_flush: + case OMPC_depobj: case OMPC_read: case OMPC_write: case OMPC_update: @@ -226,6 +229,7 @@ const OMPClauseWithPostUpdate *OMPClauseWithPostUpdate::get(const OMPClause *C) case OMPC_match: case OMPC_nontemporal: case OMPC_order: + case OMPC_destroy: break; } @@ -835,6 +839,20 @@ OMPFlushClause *OMPFlushClause::CreateEmpty(const ASTContext &C, unsigned N) { return new (Mem) OMPFlushClause(N); } +OMPDepobjClause *OMPDepobjClause::Create(const ASTContext &C, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation RParenLoc, + Expr *Depobj) { + auto *Clause = new (C) OMPDepobjClause(StartLoc, LParenLoc, RParenLoc); + Clause->setDepobj(Depobj); + return Clause; +} + +OMPDepobjClause *OMPDepobjClause::CreateEmpty(const ASTContext &C) { + return new (C) OMPDepobjClause(); +} + OMPDependClause * OMPDependClause::Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc, @@ -1407,6 +1425,10 @@ void OMPClausePrinter::VisitOMPHintClause(OMPHintClause *Node) { OS << ")"; } +void OMPClausePrinter::VisitOMPDestroyClause(OMPDestroyClause *) { + OS << "destroy"; +} + template void OMPClausePrinter::VisitOMPClauseList(T *Node, char StartSym) { for (typename T::varlist_iterator I = Node->varlist_begin(), @@ -1597,6 +1619,12 @@ void OMPClausePrinter::VisitOMPFlushClause(OMPFlushClause *Node) { } } +void OMPClausePrinter::VisitOMPDepobjClause(OMPDepobjClause *Node) { + OS << "("; + Node->getDepobj()->printPretty(OS, nullptr, Policy, 0); + OS << ")"; +} + void OMPClausePrinter::VisitOMPDependClause(OMPDependClause *Node) { OS 
<< "depend("; OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp index 15bedb9791df9f..153d492598d3fd 100644 --- a/clang/lib/AST/StmtOpenMP.cpp +++ b/clang/lib/AST/StmtOpenMP.cpp @@ -10,9 +10,8 @@ // //===----------------------------------------------------------------------===// -#include "clang/AST/StmtOpenMP.h" - #include "clang/AST/ASTContext.h" +#include "clang/AST/StmtOpenMP.h" using namespace clang; using namespace llvm::omp; @@ -759,6 +758,29 @@ OMPFlushDirective *OMPFlushDirective::CreateEmpty(const ASTContext &C, return new (Mem) OMPFlushDirective(NumClauses); } +OMPDepobjDirective *OMPDepobjDirective::Create(const ASTContext &C, + SourceLocation StartLoc, + SourceLocation EndLoc, + ArrayRef Clauses) { + unsigned Size = + llvm::alignTo(sizeof(OMPDepobjDirective), alignof(OMPClause *)); + void *Mem = C.Allocate(Size + sizeof(OMPClause *) * Clauses.size(), + alignof(OMPDepobjDirective)); + auto *Dir = new (Mem) OMPDepobjDirective(StartLoc, EndLoc, Clauses.size()); + Dir->setClauses(Clauses); + return Dir; +} + +OMPDepobjDirective *OMPDepobjDirective::CreateEmpty(const ASTContext &C, + unsigned NumClauses, + EmptyShell) { + unsigned Size = + llvm::alignTo(sizeof(OMPDepobjDirective), alignof(OMPClause *)); + void *Mem = C.Allocate(Size + sizeof(OMPClause *) * NumClauses, + alignof(OMPDepobjDirective)); + return new (Mem) OMPDepobjDirective(NumClauses); +} + OMPOrderedDirective *OMPOrderedDirective::Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 76295ae13b6004..f7a97c2743c190 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -753,6 +753,11 @@ void StmtPrinter::VisitOMPFlushDirective(OMPFlushDirective *Node) { PrintOMPExecutableDirective(Node); } +void StmtPrinter::VisitOMPDepobjDirective(OMPDepobjDirective *Node) { + Indent() << "#pragma omp 
depobj"; + PrintOMPExecutableDirective(Node); +} + void StmtPrinter::VisitOMPOrderedDirective(OMPOrderedDirective *Node) { Indent() << "#pragma omp ordered"; PrintOMPExecutableDirective(Node, Node->hasClausesOfKind()); diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 76c5fe2e540298..9f119837023599 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -532,6 +532,8 @@ void OMPClauseProfiler::VisitOMPSIMDClause(const OMPSIMDClause *) {} void OMPClauseProfiler::VisitOMPNogroupClause(const OMPNogroupClause *) {} +void OMPClauseProfiler::VisitOMPDestroyClause(const OMPDestroyClause *) {} + template void OMPClauseProfiler::VisitOMPClauseList(T *Node) { for (auto *E : Node->varlists()) { @@ -719,6 +721,10 @@ OMPClauseProfiler::VisitOMPCopyprivateClause(const OMPCopyprivateClause *C) { void OMPClauseProfiler::VisitOMPFlushClause(const OMPFlushClause *C) { VisitOMPClauseList(C); } +void OMPClauseProfiler::VisitOMPDepobjClause(const OMPDepobjClause *C) { + if (const Expr *Depobj = C->getDepobj()) + Profiler->VisitStmt(Depobj); +} void OMPClauseProfiler::VisitOMPDependClause(const OMPDependClause *C) { VisitOMPClauseList(C); } @@ -885,6 +891,10 @@ void StmtProfiler::VisitOMPFlushDirective(const OMPFlushDirective *S) { VisitOMPExecutableDirective(S); } +void StmtProfiler::VisitOMPDepobjDirective(const OMPDepobjDirective *S) { + VisitOMPExecutableDirective(S); +} + void StmtProfiler::VisitOMPOrderedDirective(const OMPOrderedDirective *S) { VisitOMPExecutableDirective(S); } diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index e7c3a8a8021ed0..8de233c191350f 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -25,7 +25,14 @@ OpenMPClauseKind clang::getOpenMPClauseKind(StringRef Str) { // clause for 'flush' directive. If the 'flush' clause is explicitly specified // the Parser should generate a warning about extra tokens at the end of the // directive. 
- if (Str == "flush") + // 'depobj' clause cannot be specified explicitly, because this is an implicit + // clause for 'depobj' directive. If the 'depobj' clause is explicitly + // specified the Parser should generate a warning about extra tokens at the + // end of the directive. + if (llvm::StringSwitch(Str) + .Case("flush", true) + .Case("depobj", true) + .Default(false)) return OMPC_unknown; return llvm::StringSwitch(Str) #define OPENMP_CLAUSE(Name, Class) .Case(#Name, OMPC_##Name) @@ -166,6 +173,7 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, case OMPC_untied: case OMPC_mergeable: case OMPC_flush: + case OMPC_depobj: case OMPC_read: case OMPC_write: case OMPC_update: @@ -194,6 +202,7 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, case OMPC_dynamic_allocators: case OMPC_match: case OMPC_nontemporal: + case OMPC_destroy: break; } llvm_unreachable("Invalid OpenMP simple clause kind"); @@ -380,6 +389,7 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, case OMPC_untied: case OMPC_mergeable: case OMPC_flush: + case OMPC_depobj: case OMPC_read: case OMPC_write: case OMPC_update: @@ -408,6 +418,7 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, case OMPC_dynamic_allocators: case OMPC_match: case OMPC_nontemporal: + case OMPC_destroy: break; } llvm_unreachable("Invalid OpenMP simple clause kind"); @@ -553,6 +564,20 @@ bool clang::isAllowedClauseForDirective(OpenMPDirectiveKind DKind, break; } break; + case OMPD_depobj: + if (OpenMPVersion < 50) + return false; + switch (CKind) { +#define OPENMP_DEPOBJ_CLAUSE(Name) \ + case OMPC_##Name: \ + return true; +#include "clang/Basic/OpenMPKinds.def" + case OMPC_depobj: + return true; + default: + break; + } + break; case OMPD_atomic: if (OpenMPVersion < 50 && (CKind == OMPC_acq_rel || CKind == OMPC_acquire || @@ -1195,6 +1220,7 @@ void clang::getOpenMPCaptureRegions( case OMPD_cancellation_point: case OMPD_cancel: case OMPD_flush: + 
case OMPD_depobj: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 79c2f1b107842e..9fe03069a44e1c 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -6703,6 +6703,7 @@ emitNumTeamsForTargetDirective(CodeGenFunction &CGF, case OMPD_taskgroup: case OMPD_atomic: case OMPD_flush: + case OMPD_depobj: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: @@ -7014,6 +7015,7 @@ emitNumThreadsForTargetDirective(CodeGenFunction &CGF, case OMPD_taskgroup: case OMPD_atomic: case OMPD_flush: + case OMPD_depobj: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: @@ -8798,6 +8800,7 @@ getNestedDistributeDirective(ASTContext &Ctx, const OMPExecutableDirective &D) { case OMPD_taskgroup: case OMPD_atomic: case OMPD_flush: + case OMPD_depobj: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: @@ -9561,6 +9564,7 @@ void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S, case OMPD_taskgroup: case OMPD_atomic: case OMPD_flush: + case OMPD_depobj: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: @@ -10201,6 +10205,7 @@ void CGOpenMPRuntime::emitTargetDataStandAloneCall( case OMPD_taskgroup: case OMPD_atomic: case OMPD_flush: + case OMPD_depobj: case OMPD_teams: case OMPD_target_data: case OMPD_distribute: diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 867bfb0727367b..b139529d8eb349 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -786,6 +786,7 @@ static bool hasNestedSPMDDirective(ASTContext &Ctx, case OMPD_taskgroup: case OMPD_atomic: case OMPD_flush: + case OMPD_depobj: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: @@ -862,6 +863,7 @@ static bool supportsSPMDExecutionMode(ASTContext &Ctx, case 
OMPD_taskgroup: case OMPD_atomic: case OMPD_flush: + case OMPD_depobj: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: @@ -1031,6 +1033,7 @@ static bool hasNestedLightweightDirective(ASTContext &Ctx, case OMPD_taskgroup: case OMPD_atomic: case OMPD_flush: + case OMPD_depobj: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: @@ -1113,6 +1116,7 @@ static bool supportsLightweightRuntime(ASTContext &Ctx, case OMPD_taskgroup: case OMPD_atomic: case OMPD_flush: + case OMPD_depobj: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 238e04999499a4..a334bab06783ee 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -247,6 +247,9 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef Attrs) { case Stmt::OMPFlushDirectiveClass: EmitOMPFlushDirective(cast(*S)); break; + case Stmt::OMPDepobjDirectiveClass: + EmitOMPDepobjDirective(cast(*S)); + break; case Stmt::OMPOrderedDirectiveClass: EmitOMPOrderedDirective(cast(*S)); break; diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index bcd2d0635caf16..bab7c6d0dcde08 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -3800,6 +3800,8 @@ void CodeGenFunction::EmitOMPFlushDirective(const OMPFlushDirective &S) { S.getBeginLoc(), AO); } +void CodeGenFunction::EmitOMPDepobjDirective(const OMPDepobjDirective &S) {} + void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S, const CodeGenLoopTy &CodeGenLoop, Expr *IncExpr) { @@ -4543,6 +4545,7 @@ static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind, case OMPC_copyin: case OMPC_copyprivate: case OMPC_flush: + case OMPC_depobj: case OMPC_proc_bind: case OMPC_schedule: case OMPC_ordered: @@ -4578,6 +4581,7 @@ static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind, case OMPC_match: case OMPC_nontemporal: 
case OMPC_order: + case OMPC_destroy: llvm_unreachable("Clause is not allowed in 'omp atomic'."); } } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 14111713ccac18..907b4d744b07a3 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3262,6 +3262,7 @@ class CodeGenFunction : public CodeGenTypeCache { void EmitOMPTaskwaitDirective(const OMPTaskwaitDirective &S); void EmitOMPTaskgroupDirective(const OMPTaskgroupDirective &S); void EmitOMPFlushDirective(const OMPFlushDirective &S); + void EmitOMPDepobjDirective(const OMPDepobjDirective &S); void EmitOMPOrderedDirective(const OMPOrderedDirective &S); void EmitOMPAtomicDirective(const OMPAtomicDirective &S); void EmitOMPTargetDirective(const OMPTargetDirective &S); diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index c356657541fa3e..d20d6298758919 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -309,7 +309,7 @@ static const char *getLDMOption(const llvm::Triple &T, const ArgList &Args) { } } -static bool getPIE(const ArgList &Args, const toolchains::Linux &ToolChain) { +static bool getPIE(const ArgList &Args, const ToolChain &TC) { if (Args.hasArg(options::OPT_shared) || Args.hasArg(options::OPT_static) || Args.hasArg(options::OPT_r) || Args.hasArg(options::OPT_static_pie)) return false; @@ -317,17 +317,16 @@ static bool getPIE(const ArgList &Args, const toolchains::Linux &ToolChain) { Arg *A = Args.getLastArg(options::OPT_pie, options::OPT_no_pie, options::OPT_nopie); if (!A) - return ToolChain.isPIEDefault(); + return TC.isPIEDefault(); return A->getOption().matches(options::OPT_pie); } -static bool getStaticPIE(const ArgList &Args, - const toolchains::Linux &ToolChain) { +static bool getStaticPIE(const ArgList &Args, const ToolChain &TC) { bool HasStaticPIE = Args.hasArg(options::OPT_static_pie); // -no-pie is an alias for -nopie. 
So, handling -nopie takes care of // -no-pie as well. if (HasStaticPIE && Args.hasArg(options::OPT_nopie)) { - const Driver &D = ToolChain.getDriver(); + const Driver &D = TC.getDriver(); const llvm::opt::OptTable &Opts = D.getOpts(); const char *StaticPIEName = Opts.getOptionName(options::OPT_static_pie); const char *NoPIEName = Opts.getOptionName(options::OPT_nopie); @@ -346,8 +345,12 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { - const toolchains::Linux &ToolChain = - static_cast(getToolChain()); + // FIXME: The Linker class constructor takes a ToolChain and not a + // Generic_ELF, so the static_cast might return a reference to a invalid + // instance (see PR45061). Ideally, the Linker constructor needs to take a + // Generic_ELF instead. + const toolchains::Generic_ELF &ToolChain = + static_cast(getToolChain()); const Driver &D = ToolChain.getDriver(); const llvm::Triple &Triple = getToolChain().getEffectiveTriple(); @@ -418,8 +421,7 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (isAndroid) CmdArgs.push_back("--warn-shared-textrel"); - for (const auto &Opt : ToolChain.ExtraOpts) - CmdArgs.push_back(Opt.c_str()); + ToolChain.addExtraOpts(CmdArgs); CmdArgs.push_back("--eh-frame-hdr"); diff --git a/clang/lib/Driver/ToolChains/Gnu.h b/clang/lib/Driver/ToolChains/Gnu.h index 083f74c0547744..fa50b56bf95412 100644 --- a/clang/lib/Driver/ToolChains/Gnu.h +++ b/clang/lib/Driver/ToolChains/Gnu.h @@ -356,6 +356,12 @@ class LLVM_LIBRARY_VISIBILITY Generic_ELF : public Generic_GCC { void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, Action::OffloadKind DeviceOffloadKind) const override; + + virtual std::string getDynamicLinker(const llvm::opt::ArgList &Args) const { + return {}; + } + + virtual void addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const {} }; } // 
end namespace toolchains diff --git a/clang/lib/Driver/ToolChains/Hurd.cpp b/clang/lib/Driver/ToolChains/Hurd.cpp index 72286bd09f1350..ce1806c4043b15 100644 --- a/clang/lib/Driver/ToolChains/Hurd.cpp +++ b/clang/lib/Driver/ToolChains/Hurd.cpp @@ -61,8 +61,7 @@ static StringRef getOSLibDir(const llvm::Triple &Triple, const ArgList &Args) { return Triple.isArch32Bit() ? "lib" : "lib64"; } -Hurd::Hurd(const Driver &D, const llvm::Triple &Triple, - const ArgList &Args) +Hurd::Hurd(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) : Generic_ELF(D, Triple, Args) { std::string SysRoot = computeSysRoot(); path_list &Paths = getFilePaths(); @@ -170,3 +169,8 @@ void Hurd::AddClangSystemIncludeArgs(const ArgList &DriverArgs, addExternCSystemInclude(DriverArgs, CC1Args, SysRoot + "/usr/include"); } + +void Hurd::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const { + for (const auto &Opt : ExtraOpts) + CmdArgs.push_back(Opt.c_str()); +} diff --git a/clang/lib/Driver/ToolChains/Hurd.h b/clang/lib/Driver/ToolChains/Hurd.h index 86c6c3f734dd91..8f88d7e8e58e4e 100644 --- a/clang/lib/Driver/ToolChains/Hurd.h +++ b/clang/lib/Driver/ToolChains/Hurd.h @@ -27,9 +27,11 @@ class LLVM_LIBRARY_VISIBILITY Hurd : public Generic_ELF { AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; - virtual std::string computeSysRoot() const; + std::string computeSysRoot() const; - virtual std::string getDynamicLinker(const llvm::opt::ArgList &Args) const; + std::string getDynamicLinker(const llvm::opt::ArgList &Args) const override; + + void addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const override; std::vector ExtraOpts; diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index d8d8a8da8fca2f..3d76e680114990 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -1007,3 +1007,8 @@ llvm::DenormalMode Linux::getDefaultDenormalModeForType( 
return llvm::DenormalMode::getIEEE(); } } + +void Linux::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const { + for (const auto &Opt : ExtraOpts) + CmdArgs.push_back(Opt.c_str()); +} diff --git a/clang/lib/Driver/ToolChains/Linux.h b/clang/lib/Driver/ToolChains/Linux.h index e3c0103ac3e5d8..999f991b636074 100644 --- a/clang/lib/Driver/ToolChains/Linux.h +++ b/clang/lib/Driver/ToolChains/Linux.h @@ -42,7 +42,9 @@ class LLVM_LIBRARY_VISIBILITY Linux : public Generic_ELF { llvm::opt::ArgStringList &CmdArgs) const override; virtual std::string computeSysRoot() const; - virtual std::string getDynamicLinker(const llvm::opt::ArgList &Args) const; + std::string getDynamicLinker(const llvm::opt::ArgList &Args) const override; + + void addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const override; std::vector ExtraOpts; diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index ac117840ea3358..d0d08e470e6c19 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -106,7 +106,7 @@ namespace format { TYPE(CSharpNullable) \ TYPE(CSharpNullCoalescing) \ TYPE(CSharpNullConditional) \ - TYPE(CSharpNullConditionalSq) \ + TYPE(CSharpNullConditionalLSquare) \ TYPE(Unknown) enum TokenType { diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index da73361ee3d5b8..8fa78b773e5eb8 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -345,7 +345,7 @@ bool FormatTokenLexer::tryMergeCSharpNullConditional() { if (PeriodOrLSquare->is(tok::l_square)) { Question->Tok.setKind(tok::question); // no '?[' in clang tokens. - Question->Type = TT_CSharpNullConditionalSq; + Question->Type = TT_CSharpNullConditionalLSquare; } else { Question->Tok.setKind(tok::question); // no '?.' in clang tokens. 
Question->Type = TT_CSharpNullConditional; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index e1e08686ac44e8..35e0b423cfc492 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -972,7 +972,7 @@ class AnnotatingParser { } break; case tok::question: - if (Tok->is(TT_CSharpNullConditionalSq)) { + if (Tok->is(TT_CSharpNullConditionalLSquare)) { if (!parseSquare()) return false; break; @@ -1456,7 +1456,7 @@ class AnnotatingParser { return; } if (CurrentToken->TokenText == "?[") { - Current.Type = TT_CSharpNullConditionalSq; + Current.Type = TT_CSharpNullConditionalLSquare; return; } } @@ -2947,11 +2947,11 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, return true; // No space before '?['. - if (Right.is(TT_CSharpNullConditionalSq)) + if (Right.is(TT_CSharpNullConditionalLSquare)) return false; // Possible space inside `?[ 0 ]`. - if (Left.is(TT_CSharpNullConditionalSq)) + if (Left.is(TT_CSharpNullConditionalLSquare)) return Style.SpacesInSquareBrackets; // space between keywords and paren e.g. 
"using (" diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 9cc41c9d96f898..8638d4300b218d 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -2263,7 +2263,7 @@ void CompilerInvocation::setLangDefaults(LangOptions &Opts, InputKind IK, if (T.isPS4()) LangStd = LangStandard::lang_gnu99; else - LangStd = LangStandard::lang_gnu11; + LangStd = LangStandard::lang_gnu17; #endif break; case Language::ObjC: diff --git a/clang/lib/Headers/opencl-c.h b/clang/lib/Headers/opencl-c.h index 3210f93cc85127..6ac9f92d23a265 100644 --- a/clang/lib/Headers/opencl-c.h +++ b/clang/lib/Headers/opencl-c.h @@ -13432,18 +13432,12 @@ int __ovld atomic_fetch_min_explicit(volatile atomic_int *object, int operand, m uint __ovld atomic_fetch_min(volatile atomic_uint *object, uint operand); uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order); uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_min(volatile atomic_uint *object, int operand); -uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, int operand, memory_order order); -uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, int operand, memory_order order, memory_scope scope); int __ovld atomic_fetch_max(volatile atomic_int *object, int operand); int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order); int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope); uint __ovld atomic_fetch_max(volatile atomic_uint *object, uint operand); uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order); uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope); 
-uint __ovld atomic_fetch_max(volatile atomic_uint *object, int operand); -uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, int operand, memory_order order); -uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, int operand, memory_order order, memory_scope scope); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) long __ovld atomic_fetch_add(volatile atomic_long *object, long operand); @@ -13482,18 +13476,12 @@ long __ovld atomic_fetch_min_explicit(volatile atomic_long *object, long operand ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, ulong operand); ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order); ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, long operand); -ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, long operand, memory_order order); -ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, long operand, memory_order order, memory_scope scope); long __ovld atomic_fetch_max(volatile atomic_long *object, long operand); long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order); long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope); ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, ulong operand); ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order); ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, long operand); -ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long operand, memory_order order); -ulong __ovld 
atomic_fetch_max_explicit(volatile atomic_ulong *object, long operand, memory_order order, memory_scope scope); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) // OpenCL v2.0 s6.13.11.7.5: diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 8c822ec1d0e9a7..56e88d15f8faaf 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -1842,6 +1842,7 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( case OMPD_taskwait: case OMPD_taskgroup: case OMPD_flush: + case OMPD_depobj: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -2064,6 +2065,7 @@ Parser::ParseOpenMPDeclarativeOrExecutableDirective(ParsedStmtContext StmtCtx) { break; } case OMPD_flush: + case OMPD_depobj: case OMPD_taskyield: case OMPD_barrier: case OMPD_taskwait: @@ -2123,10 +2125,13 @@ Parser::ParseOpenMPDeclarativeOrExecutableDirective(ParsedStmtContext StmtCtx) { case OMPD_target_teams_distribute_parallel_for: case OMPD_target_teams_distribute_parallel_for_simd: case OMPD_target_teams_distribute_simd: { - // Special processing for flush clause. - Token FlushTok; - if (DKind == OMPD_flush) - FlushTok = Tok; + // Special processing for flush and depobj clauses. + Token ImplicitTok; + bool ImplicitClauseAllowed = false; + if (DKind == OMPD_flush || DKind == OMPD_depobj) { + ImplicitTok = Tok; + ImplicitClauseAllowed = true; + } ConsumeToken(); // Parse directive name of the 'critical' directive if any. 
if (DKind == OMPD_critical) { @@ -2156,22 +2161,32 @@ Parser::ParseOpenMPDeclarativeOrExecutableDirective(ParsedStmtContext StmtCtx) { Actions.StartOpenMPDSABlock(DKind, DirName, Actions.getCurScope(), Loc); while (Tok.isNot(tok::annot_pragma_openmp_end)) { - bool FlushHasClause = false; - if (DKind == OMPD_flush && Tok.is(tok::l_paren)) { - FlushHasClause = true; + bool HasImplicitClause = false; + if (ImplicitClauseAllowed && Tok.is(tok::l_paren)) { + HasImplicitClause = true; // Push copy of the current token back to stream to properly parse - // pseudo-clause OMPFlushClause. + // pseudo-clause OMPFlushClause or OMPDepobjClause. PP.EnterToken(Tok, /*IsReinject*/ true); - PP.EnterToken(FlushTok, /*IsReinject*/ true); + PP.EnterToken(ImplicitTok, /*IsReinject*/ true); ConsumeAnyToken(); } - OpenMPClauseKind CKind = - Tok.isAnnotation() - ? OMPC_unknown - : FlushHasClause ? OMPC_flush - : getOpenMPClauseKind(PP.getSpelling(Tok)); + OpenMPClauseKind CKind = Tok.isAnnotation() + ? OMPC_unknown + : getOpenMPClauseKind(PP.getSpelling(Tok)); + if (HasImplicitClause) { + assert(CKind == OMPC_unknown && "Must be unknown implicit clause."); + if (DKind == OMPD_flush) { + CKind = OMPC_flush; + } else { + assert(DKind == OMPD_depobj && + "Expected flush or depobj directives."); + CKind = OMPC_depobj; + } + } + // No more implicit clauses allowed. 
+ ImplicitClauseAllowed = false; Actions.StartOpenMPClause(CKind); - FlushHasClause = false; + HasImplicitClause = false; OMPClause *Clause = ParseOpenMPClause(DKind, CKind, !FirstClauses[CKind].getInt()); FirstClauses[CKind].setInt(true); @@ -2324,7 +2339,8 @@ bool Parser::ParseOpenMPSimpleVarList( /// nogroup-clause | num_tasks-clause | hint-clause | to-clause | /// from-clause | is_device_ptr-clause | task_reduction-clause | /// in_reduction-clause | allocator-clause | allocate-clause | -/// acq_rel-clause | acquire-clause | release-clause | relaxed-clause +/// acq_rel-clause | acquire-clause | release-clause | relaxed-clause | +/// depobj-clause | destroy-clause /// OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, OpenMPClauseKind CKind, bool FirstClause) { @@ -2355,6 +2371,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, case OMPC_num_tasks: case OMPC_hint: case OMPC_allocator: + case OMPC_depobj: // OpenMP [2.5, Restrictions] // At most one num_threads clause can appear on the directive. // OpenMP [2.8.1, simd construct, Restrictions] @@ -2444,6 +2461,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, case OMPC_unified_shared_memory: case OMPC_reverse_offload: case OMPC_dynamic_allocators: + case OMPC_destroy: // OpenMP [2.7.1, Restrictions, p. 9] // Only one ordered clause can appear on a loop directive. // OpenMP [2.7.1, Restrictions, C/C++, p. 4] diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index a162ff091efd13..04611dadde6614 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -1148,11 +1148,6 @@ namespace { continue; } - if (isFollowedByFallThroughComment(LastStmt)) { - ++AnnotatedCnt; - continue; // Fallthrough comment, good. 
- } - ++UnannotatedCnt; } return !!UnannotatedCnt; @@ -1213,41 +1208,10 @@ namespace { return nullptr; } - bool isFollowedByFallThroughComment(const Stmt *Statement) { - // Try to detect whether the fallthough is marked by a comment like - // /*FALLTHOUGH*/. - bool Invalid; - const char *SourceData = S.getSourceManager().getCharacterData( - Statement->getEndLoc(), &Invalid); - if (Invalid) - return false; - const char *LineStart = SourceData; - for (;;) { - LineStart = strchr(LineStart, '\n'); - if (LineStart == nullptr) - return false; - ++LineStart; // Start of next line. - const char *LineEnd = strchr(LineStart, '\n'); - StringRef Line(LineStart, - LineEnd ? LineEnd - LineStart : strlen(LineStart)); - if (LineStart == LineEnd || - Line.find_first_not_of(" \t\r") == StringRef::npos) - continue; // Whitespace-only line. - if (!FallthroughRegex.isValid()) - FallthroughRegex = - llvm::Regex("(/\\*[ \\t]*fall(s | |-)?thr(ough|u)\\.?[ \\t]*\\*/)" - "|(//[ \\t]*fall(s | |-)?thr(ough|u)\\.?[ \\t]*)", - llvm::Regex::IgnoreCase); - assert(FallthroughRegex.isValid()); - return FallthroughRegex.match(Line); - } - } - bool FoundSwitchStatements; AttrStmts FallthroughStmts; Sema &S; llvm::SmallPtrSet ReachableBlocks; - llvm::Regex FallthroughRegex; }; } // anonymous namespace diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 2a66303d6d9a32..cda6910364e58f 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -11587,7 +11587,16 @@ static void AnalyzeImplicitConversions(Sema &S, Expr *OrigE, SourceLocation CC, if (E->isTypeDependent() || E->isValueDependent()) return; - if (const auto *UO = dyn_cast(E)) + Expr *SourceExpr = E; + // Examine, but don't traverse into the source expression of an + // OpaqueValueExpr, since it may have multiple parents and we don't want to + // emit duplicate diagnostics. 
Its fine to examine the form or attempt to + // evaluate it in the context of checking the specific conversion to T though. + if (auto *OVE = dyn_cast(E)) + if (auto *Src = OVE->getSourceExpr()) + SourceExpr = Src; + + if (const auto *UO = dyn_cast(SourceExpr)) if (UO->getOpcode() == UO_Not && UO->getSubExpr()->isKnownToHaveBooleanValue()) S.Diag(UO->getBeginLoc(), diag::warn_bitwise_negation_bool) @@ -11596,21 +11605,20 @@ static void AnalyzeImplicitConversions(Sema &S, Expr *OrigE, SourceLocation CC, // For conditional operators, we analyze the arguments as if they // were being fed directly into the output. - if (isa(E)) { - ConditionalOperator *CO = cast(E); + if (auto *CO = dyn_cast(SourceExpr)) { CheckConditionalOperator(S, CO, CC, T); return; } // Check implicit argument conversions for function calls. - if (CallExpr *Call = dyn_cast(E)) + if (CallExpr *Call = dyn_cast(SourceExpr)) CheckImplicitArgumentConversions(S, Call, CC); // Go ahead and check any implicit conversions we might have skipped. // The non-canonical typecheck is just an optimization; // CheckImplicitConversion will filter out dead implicit conversions. - if (E->getType() != T) - CheckImplicitConversion(S, E, T, CC, nullptr, IsListInit); + if (SourceExpr->getType() != T) + CheckImplicitConversion(S, SourceExpr, T, CC, nullptr, IsListInit); // Now continue drilling into this expression. 
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp index 193eaa3e01f936..1e892aa622df76 100644 --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -1430,6 +1430,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Stmt::OMPDistributeParallelForSimdDirectiveClass: case Stmt::OMPDistributeSimdDirectiveClass: case Stmt::OMPFlushDirectiveClass: + case Stmt::OMPDepobjDirectiveClass: case Stmt::OMPForDirectiveClass: case Stmt::OMPForSimdDirectiveClass: case Stmt::OMPMasterDirectiveClass: diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index de732577c81b0c..ecabb3aefd20c4 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -266,6 +266,8 @@ class DSAStackTy { SmallVector RequiresDecls; /// omp_allocator_handle_t type. QualType OMPAllocatorHandleT; + /// omp_depend_t type. + QualType OMPDependT; /// Expression for the predefined allocators. Expr *OMPPredefinedAllocators[OMPAllocateDeclAttr::OMPUserDefinedMemAlloc] = { nullptr}; @@ -289,6 +291,10 @@ class DSAStackTy { Expr *getAllocator(OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind) const { return OMPPredefinedAllocators[AllocatorKind]; } + /// Sets omp_depend_t type. + void setOMPDependT(QualType Ty) { OMPDependT = Ty; } + /// Gets omp_depend_t type. 
+ QualType getOMPDependT() const { return OMPDependT; } bool isClauseParsingMode() const { return ClauseKindMode != OMPC_unknown; } OpenMPClauseKind getClauseParsingMode() const { @@ -3740,6 +3746,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { case OMPD_cancellation_point: case OMPD_cancel: case OMPD_flush: + case OMPD_depobj: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -4746,6 +4753,11 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( "No associated statement allowed for 'omp flush' directive"); Res = ActOnOpenMPFlushDirective(ClausesWithImplicit, StartLoc, EndLoc); break; + case OMPD_depobj: + assert(AStmt == nullptr && + "No associated statement allowed for 'omp depobj' directive"); + Res = ActOnOpenMPDepobjDirective(ClausesWithImplicit, StartLoc, EndLoc); + break; case OMPD_ordered: Res = ActOnOpenMPOrderedDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); @@ -5029,9 +5041,11 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( case OMPC_is_device_ptr: case OMPC_nontemporal: case OMPC_order: + case OMPC_destroy: continue; case OMPC_allocator: case OMPC_flush: + case OMPC_depobj: case OMPC_threadprivate: case OMPC_uniform: case OMPC_unknown: @@ -8597,6 +8611,28 @@ StmtResult Sema::ActOnOpenMPFlushDirective(ArrayRef Clauses, return OMPFlushDirective::Create(Context, StartLoc, EndLoc, Clauses); } +StmtResult Sema::ActOnOpenMPDepobjDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc) { + if (Clauses.empty()) { + Diag(StartLoc, diag::err_omp_depobj_expected); + return StmtError(); + } else if (Clauses[0]->getClauseKind() != OMPC_depobj) { + Diag(Clauses[0]->getBeginLoc(), diag::err_omp_depobj_expected); + return StmtError(); + } + // Only depobj expression and another single clause is allowed. 
+ if (Clauses.size() > 2) { + Diag(Clauses[2]->getBeginLoc(), + diag::err_omp_depobj_single_clause_expected); + return StmtError(); + } else if (Clauses.size() < 1) { + Diag(Clauses[0]->getEndLoc(), diag::err_omp_depobj_single_clause_expected); + return StmtError(); + } + return OMPDepobjDirective::Create(Context, StartLoc, EndLoc, Clauses); +} + StmtResult Sema::ActOnOpenMPOrderedDirective(ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, @@ -10890,6 +10926,9 @@ OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, case OMPC_hint: Res = ActOnOpenMPHintClause(Expr, StartLoc, LParenLoc, EndLoc); break; + case OMPC_depobj: + Res = ActOnOpenMPDepobjClause(Expr, StartLoc, LParenLoc, EndLoc); + break; case OMPC_if: case OMPC_default: case OMPC_proc_bind: @@ -10942,6 +10981,7 @@ OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, case OMPC_match: case OMPC_nontemporal: case OMPC_order: + case OMPC_destroy: llvm_unreachable("Clause is not allowed."); } return Res; @@ -11071,6 +11111,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_taskwait: case OMPD_cancellation_point: case OMPD_flush: + case OMPD_depobj: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11141,6 +11182,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_taskwait: case OMPD_cancellation_point: case OMPD_flush: + case OMPD_depobj: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11216,6 +11258,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_taskwait: case OMPD_cancellation_point: case OMPD_flush: + case OMPD_depobj: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11288,6 +11331,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_taskwait: case OMPD_cancellation_point: case OMPD_flush: + case OMPD_depobj: case OMPD_declare_reduction: case 
OMPD_declare_mapper: case OMPD_declare_simd: @@ -11361,6 +11405,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_taskwait: case OMPD_cancellation_point: case OMPD_flush: + case OMPD_depobj: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11433,6 +11478,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_taskwait: case OMPD_cancellation_point: case OMPD_flush: + case OMPD_depobj: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11504,6 +11550,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_taskwait: case OMPD_cancellation_point: case OMPD_flush: + case OMPD_depobj: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11578,6 +11625,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_taskwait: case OMPD_cancellation_point: case OMPD_flush: + case OMPD_depobj: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11627,6 +11675,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPC_threadprivate: case OMPC_allocate: case OMPC_flush: + case OMPC_depobj: case OMPC_read: case OMPC_write: case OMPC_update: @@ -11658,6 +11707,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPC_match: case OMPC_nontemporal: case OMPC_order: + case OMPC_destroy: llvm_unreachable("Unexpected OpenMP clause."); } return CaptureRegion; @@ -11933,7 +11983,8 @@ static bool findOMPAllocatorHandleT(Sema &S, SourceLocation Loc, Stack->setAllocator(AllocatorKind, Res.get()); } if (ErrorFound) { - S.Diag(Loc, diag::err_implied_omp_allocator_handle_t_not_found); + S.Diag(Loc, diag::err_omp_implied_type_not_found) + << "omp_allocator_handle_t"; return false; } OMPAllocatorHandleT.addConst(); @@ -12052,6 +12103,7 @@ OMPClause *Sema::ActOnOpenMPSimpleClause( case OMPC_threadprivate: case OMPC_allocate: case OMPC_flush: + case OMPC_depobj: 
case OMPC_read: case OMPC_write: case OMPC_update: @@ -12088,6 +12140,7 @@ OMPClause *Sema::ActOnOpenMPSimpleClause( case OMPC_device_type: case OMPC_match: case OMPC_nontemporal: + case OMPC_destroy: llvm_unreachable("Clause is not allowed."); } return Res; @@ -12248,6 +12301,7 @@ OMPClause *Sema::ActOnOpenMPSingleExprWithArgClause( case OMPC_threadprivate: case OMPC_allocate: case OMPC_flush: + case OMPC_depobj: case OMPC_read: case OMPC_write: case OMPC_update: @@ -12284,6 +12338,7 @@ OMPClause *Sema::ActOnOpenMPSingleExprWithArgClause( case OMPC_match: case OMPC_nontemporal: case OMPC_order: + case OMPC_destroy: llvm_unreachable("Clause is not allowed."); } return Res; @@ -12463,6 +12518,9 @@ OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind, case OMPC_dynamic_allocators: Res = ActOnOpenMPDynamicAllocatorsClause(StartLoc, EndLoc); break; + case OMPC_destroy: + Res = ActOnOpenMPDestroyClause(StartLoc, EndLoc); + break; case OMPC_if: case OMPC_final: case OMPC_num_threads: @@ -12487,6 +12545,7 @@ OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind, case OMPC_threadprivate: case OMPC_allocate: case OMPC_flush: + case OMPC_depobj: case OMPC_depend: case OMPC_device: case OMPC_map: @@ -12610,6 +12669,11 @@ OMPClause *Sema::ActOnOpenMPDynamicAllocatorsClause(SourceLocation StartLoc, return new (Context) OMPDynamicAllocatorsClause(StartLoc, EndLoc); } +OMPClause *Sema::ActOnOpenMPDestroyClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (Context) OMPDestroyClause(StartLoc, EndLoc); +} + OMPClause *Sema::ActOnOpenMPVarListClause( OpenMPClauseKind Kind, ArrayRef VarList, Expr *TailExpr, const OMPVarListLocTy &Locs, SourceLocation ColonLoc, @@ -12712,6 +12776,7 @@ OMPClause *Sema::ActOnOpenMPVarListClause( Res = ActOnOpenMPNontemporalClause(VarList, StartLoc, LParenLoc, EndLoc); break; case OMPC_if: + case OMPC_depobj: case OMPC_final: case OMPC_num_threads: case OMPC_safelen: @@ -12757,6 +12822,7 @@ OMPClause 
*Sema::ActOnOpenMPVarListClause( case OMPC_device_type: case OMPC_match: case OMPC_order: + case OMPC_destroy: llvm_unreachable("Clause is not allowed."); } return Res; @@ -15120,6 +15186,49 @@ OMPClause *Sema::ActOnOpenMPFlushClause(ArrayRef VarList, return OMPFlushClause::Create(Context, StartLoc, LParenLoc, EndLoc, VarList); } +/// Tries to find the omp_depend_t type. +static bool findOMPDependT(Sema &S, SourceLocation Loc, DSAStackTy *Stack) { + QualType OMPDependT = Stack->getOMPDependT(); + if (!OMPDependT.isNull()) + return true; + IdentifierInfo *II = &S.PP.getIdentifierTable().get("omp_depend_t"); + ParsedType PT = S.getTypeName(*II, Loc, S.getCurScope()); + if (!PT.getAsOpaquePtr() || PT.get().isNull()) { + S.Diag(Loc, diag::err_omp_implied_type_not_found) << "omp_depend_t"; + return false; + } + Stack->setOMPDependT(PT.get()); + return true; +} + +OMPClause *Sema::ActOnOpenMPDepobjClause(Expr *Depobj, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + if (!Depobj) + return nullptr; + + bool OMPDependTFound = findOMPDependT(*this, StartLoc, DSAStack); + + // OpenMP 5.0, 2.17.10.1 depobj Construct + // depobj is an lvalue expression of type omp_depend_t. 
+ if (!Depobj->isTypeDependent() && !Depobj->isValueDependent() && + !Depobj->isInstantiationDependent() && + !Depobj->containsUnexpandedParameterPack() && + (OMPDependTFound && + !Context.typesAreCompatible(DSAStack->getOMPDependT(), Depobj->getType(), + /*CompareUnqualified=*/true))) { + Diag(Depobj->getExprLoc(), diag::err_omp_expected_omp_depend_t_lvalue) + << 0 << Depobj->getType() << Depobj->getSourceRange(); + } + + if (!Depobj->isLValue()) { + Diag(Depobj->getExprLoc(), diag::err_omp_expected_omp_depend_t_lvalue) + << 1 << Depobj->getSourceRange(); + } + + return OMPDepobjClause::Create(Context, StartLoc, LParenLoc, EndLoc, Depobj); +} + OMPClause * Sema::ActOnOpenMPDependClause(OpenMPDependClauseKind DepKind, SourceLocation DepLoc, SourceLocation ColonLoc, diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 6ad1c61217589c..002b73c3a1dd72 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -1811,6 +1811,17 @@ class TreeTransform { EndLoc); } + /// Build a new OpenMP 'depobj' pseudo clause. + /// + /// By default, performs semantic analysis to build the new OpenMP clause. + /// Subclasses may override this routine to provide different behavior. + OMPClause *RebuildOMPDepobjClause(Expr *Depobj, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + return getSema().ActOnOpenMPDepobjClause(Depobj, StartLoc, LParenLoc, + EndLoc); + } + /// Build a new OpenMP 'depend' pseudo clause. /// /// By default, performs semantic analysis to build the new OpenMP clause. 
@@ -8263,6 +8274,17 @@ TreeTransform::TransformOMPFlushDirective(OMPFlushDirective *D) { return Res; } +template +StmtResult +TreeTransform::TransformOMPDepobjDirective(OMPDepobjDirective *D) { + DeclarationNameInfo DirName; + getDerived().getSema().StartOpenMPDSABlock(OMPD_depobj, DirName, nullptr, + D->getBeginLoc()); + StmtResult Res = getDerived().TransformOMPExecutableDirective(D); + getDerived().getSema().EndOpenMPDSABlock(Res.get()); + return Res; +} + template StmtResult TreeTransform::TransformOMPOrderedDirective(OMPOrderedDirective *D) { @@ -8851,6 +8873,13 @@ TreeTransform::TransformOMPNogroupClause(OMPNogroupClause *C) { return C; } +template +OMPClause * +TreeTransform::TransformOMPDestroyClause(OMPDestroyClause *C) { + // No need to rebuild this clause, no template-dependent parameters. + return C; +} + template OMPClause *TreeTransform::TransformOMPUnifiedAddressClause( OMPUnifiedAddressClause *C) { @@ -9164,6 +9193,16 @@ OMPClause *TreeTransform::TransformOMPFlushClause(OMPFlushClause *C) { C->getLParenLoc(), C->getEndLoc()); } +template +OMPClause * +TreeTransform::TransformOMPDepobjClause(OMPDepobjClause *C) { + ExprResult E = getDerived().TransformExpr(C->getDepobj()); + if (E.isInvalid()) + return nullptr; + return getDerived().RebuildOMPDepobjClause(E.get(), C->getBeginLoc(), + C->getLParenLoc(), C->getEndLoc()); +} + template OMPClause * TreeTransform::TransformOMPDependClause(OMPDependClause *C) { diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index a74ccc9ed17959..865a666ce8f445 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -11737,6 +11737,9 @@ OMPClause *OMPClauseReader::readClause() { case OMPC_flush: C = OMPFlushClause::CreateEmpty(Context, Record.readInt()); break; + case OMPC_depobj: + C = OMPDepobjClause::CreateEmpty(Context); + break; case OMPC_depend: { unsigned NumVars = Record.readInt(); unsigned NumLoops = Record.readInt(); @@ -11824,6 
+11827,9 @@ OMPClause *OMPClauseReader::readClause() { case OMPC_order: C = new (Context) OMPOrderClause(); break; + case OMPC_destroy: + C = new (Context) OMPDestroyClause(); + break; } assert(C && "Unknown OMPClause type"); @@ -11952,6 +11958,8 @@ void OMPClauseReader::VisitOMPSIMDClause(OMPSIMDClause *) {} void OMPClauseReader::VisitOMPNogroupClause(OMPNogroupClause *) {} +void OMPClauseReader::VisitOMPDestroyClause(OMPDestroyClause *) {} + void OMPClauseReader::VisitOMPUnifiedAddressClause(OMPUnifiedAddressClause *) {} void OMPClauseReader::VisitOMPUnifiedSharedMemoryClause( @@ -12249,6 +12257,11 @@ void OMPClauseReader::VisitOMPFlushClause(OMPFlushClause *C) { C->setVarRefs(Vars); } +void OMPClauseReader::VisitOMPDepobjClause(OMPDepobjClause *C) { + C->setDepobj(Record.readSubExpr()); + C->setLParenLoc(Record.readSourceLocation()); +} + void OMPClauseReader::VisitOMPDependClause(OMPDependClause *C) { C->setLParenLoc(Record.readSourceLocation()); C->setDependencyKind( diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index 3da7a71e7f703a..fc83dc42d4d150 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2343,6 +2343,13 @@ void ASTStmtReader::VisitOMPFlushDirective(OMPFlushDirective *D) { VisitOMPExecutableDirective(D); } +void ASTStmtReader::VisitOMPDepobjDirective(OMPDepobjDirective *D) { + VisitStmt(D); + // The NumClauses field was read in ReadStmtFromStream. + Record.skipInts(1); + VisitOMPExecutableDirective(D); +} + void ASTStmtReader::VisitOMPOrderedDirective(OMPOrderedDirective *D) { VisitStmt(D); // The NumClauses field was read in ReadStmtFromStream. 
@@ -3174,6 +3181,11 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { Context, Record[ASTStmtReader::NumStmtFields], Empty); break; + case STMT_OMP_DEPOBJ_DIRECTIVE: + S = OMPDepobjDirective::CreateEmpty( + Context, Record[ASTStmtReader::NumStmtFields], Empty); + break; + case STMT_OMP_ORDERED_DIRECTIVE: S = OMPOrderedDirective::CreateEmpty( Context, Record[ASTStmtReader::NumStmtFields], Empty); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 6f77d4f5d1156b..bf59bca29e8cd3 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6161,6 +6161,8 @@ void OMPClauseWriter::VisitOMPSIMDClause(OMPSIMDClause *) {} void OMPClauseWriter::VisitOMPNogroupClause(OMPNogroupClause *) {} +void OMPClauseWriter::VisitOMPDestroyClause(OMPDestroyClause *) {} + void OMPClauseWriter::VisitOMPPrivateClause(OMPPrivateClause *C) { Record.push_back(C->varlist_size()); Record.AddSourceLocation(C->getLParenLoc()); @@ -6342,6 +6344,11 @@ void OMPClauseWriter::VisitOMPFlushClause(OMPFlushClause *C) { Record.AddStmt(VE); } +void OMPClauseWriter::VisitOMPDepobjClause(OMPDepobjClause *C) { + Record.AddStmt(C->getDepobj()); + Record.AddSourceLocation(C->getLParenLoc()); +} + void OMPClauseWriter::VisitOMPDependClause(OMPDependClause *C) { Record.push_back(C->varlist_size()); Record.push_back(C->getNumLoops()); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 9b6e869e1c3488..d2b1fc2becf108 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2314,6 +2314,13 @@ void ASTStmtWriter::VisitOMPFlushDirective(OMPFlushDirective *D) { Code = serialization::STMT_OMP_FLUSH_DIRECTIVE; } +void ASTStmtWriter::VisitOMPDepobjDirective(OMPDepobjDirective *D) { + VisitStmt(D); + Record.push_back(D->getNumClauses()); + VisitOMPExecutableDirective(D); + Code = serialization::STMT_OMP_DEPOBJ_DIRECTIVE; +} + void 
ASTStmtWriter::VisitOMPOrderedDirective(OMPOrderedDirective *D) { VisitStmt(D); Record.push_back(D->getNumClauses()); diff --git a/clang/lib/StaticAnalyzer/Checkers/ExprInspectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ExprInspectionChecker.cpp index 54b364f38f812e..10b27831d89f80 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ExprInspectionChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ExprInspectionChecker.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "Taint.h" #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h" #include "clang/StaticAnalyzer/Checkers/SValExplainer.h" #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h" @@ -46,6 +47,7 @@ class ExprInspectionChecker : public Checker(C.getCalleeName(CE)) - .Case("clang_analyzer_eval", &ExprInspectionChecker::analyzerEval) - .Case("clang_analyzer_checkInlined", - &ExprInspectionChecker::analyzerCheckInlined) - .Case("clang_analyzer_crash", &ExprInspectionChecker::analyzerCrash) - .Case("clang_analyzer_warnIfReached", - &ExprInspectionChecker::analyzerWarnIfReached) - .Case("clang_analyzer_warnOnDeadSymbol", - &ExprInspectionChecker::analyzerWarnOnDeadSymbol) - .StartsWith("clang_analyzer_explain", &ExprInspectionChecker::analyzerExplain) - .StartsWith("clang_analyzer_dump", &ExprInspectionChecker::analyzerDump) - .Case("clang_analyzer_getExtent", &ExprInspectionChecker::analyzerGetExtent) - .Case("clang_analyzer_printState", - &ExprInspectionChecker::analyzerPrintState) - .Case("clang_analyzer_numTimesReached", - &ExprInspectionChecker::analyzerNumTimesReached) - .Case("clang_analyzer_hashDump", &ExprInspectionChecker::analyzerHashDump) - .Case("clang_analyzer_denote", &ExprInspectionChecker::analyzerDenote) - .Case("clang_analyzer_express", &ExprInspectionChecker::analyzerExpress) - .Default(nullptr); + FnCheck Handler = + llvm::StringSwitch(C.getCalleeName(CE)) + .Case("clang_analyzer_eval", 
&ExprInspectionChecker::analyzerEval) + .Case("clang_analyzer_checkInlined", + &ExprInspectionChecker::analyzerCheckInlined) + .Case("clang_analyzer_crash", &ExprInspectionChecker::analyzerCrash) + .Case("clang_analyzer_warnIfReached", + &ExprInspectionChecker::analyzerWarnIfReached) + .Case("clang_analyzer_warnOnDeadSymbol", + &ExprInspectionChecker::analyzerWarnOnDeadSymbol) + .StartsWith("clang_analyzer_explain", + &ExprInspectionChecker::analyzerExplain) + .StartsWith("clang_analyzer_dump", + &ExprInspectionChecker::analyzerDump) + .Case("clang_analyzer_getExtent", + &ExprInspectionChecker::analyzerGetExtent) + .Case("clang_analyzer_printState", + &ExprInspectionChecker::analyzerPrintState) + .Case("clang_analyzer_numTimesReached", + &ExprInspectionChecker::analyzerNumTimesReached) + .Case("clang_analyzer_hashDump", + &ExprInspectionChecker::analyzerHashDump) + .Case("clang_analyzer_denote", &ExprInspectionChecker::analyzerDenote) + .Case("clang_analyzer_express", + &ExprInspectionChecker::analyzerExpress) + .StartsWith("clang_analyzer_isTainted", + &ExprInspectionChecker::analyzerIsTainted) + .Default(nullptr); if (!Handler) return false; @@ -412,6 +422,17 @@ void ExprInspectionChecker::analyzerExpress(const CallExpr *CE, reportBug(*Str, C); } +void ExprInspectionChecker::analyzerIsTainted(const CallExpr *CE, + CheckerContext &C) const { + if (CE->getNumArgs() != 1) { + reportBug("clang_analyzer_isTainted() requires exactly one argument", C); + return; + } + const bool IsTainted = + taint::isTainted(C.getState(), CE->getArg(0), C.getLocationContext()); + reportBug(IsTainted ? 
"YES" : "NO", C); +} + void ento::registerExprInspectionChecker(CheckerManager &Mgr) { Mgr.registerChecker(); } diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index 801b30a9ab6c6c..1b13c49713ba1d 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1257,6 +1257,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::OMPTaskwaitDirectiveClass: case Stmt::OMPTaskgroupDirectiveClass: case Stmt::OMPFlushDirectiveClass: + case Stmt::OMPDepobjDirectiveClass: case Stmt::OMPOrderedDirectiveClass: case Stmt::OMPAtomicDirectiveClass: case Stmt::OMPTargetDirectiveClass: diff --git a/clang/lib/Tooling/Syntax/Tokens.cpp b/clang/lib/Tooling/Syntax/Tokens.cpp index ae5bc687553b7f..9e12d8b603bfe4 100644 --- a/clang/lib/Tooling/Syntax/Tokens.cpp +++ b/clang/lib/Tooling/Syntax/Tokens.cpp @@ -183,6 +183,16 @@ llvm::ArrayRef TokenBuffer::spelledTokens(FileID FID) const { return It->second.SpelledTokens; } +const syntax::Token *TokenBuffer::spelledTokenAt(SourceLocation Loc) const { + assert(Loc.isFileID()); + const auto *Tok = llvm::partition_point( + spelledTokens(SourceMgr->getFileID(Loc)), + [&](const syntax::Token &Tok) { return Tok.location() < Loc; }); + if (!Tok || Tok->location() != Loc) + return nullptr; + return Tok; +} + std::string TokenBuffer::Mapping::str() const { return std::string( llvm::formatv("spelled tokens: [{0},{1}), expanded tokens: [{2},{3})", diff --git a/clang/test/Analysis/debug-exprinspection-istainted.c b/clang/test/Analysis/debug-exprinspection-istainted.c new file mode 100644 index 00000000000000..e2f6821e4aa9ab --- /dev/null +++ b/clang/test/Analysis/debug-exprinspection-istainted.c @@ -0,0 +1,27 @@ +// RUN: %clang_analyze_cc1 -verify %s \ +// RUN: -analyzer-checker=core \ +// RUN: -analyzer-checker=debug.ExprInspection \ +// RUN: -analyzer-checker=alpha.security.taint + +int scanf(const char *restrict format, ...); 
+void clang_analyzer_isTainted(char); +void clang_analyzer_isTainted_any_suffix(char); +void clang_analyzer_isTainted_many_arguments(char, int, int); + +void foo() { + char buf[32] = ""; + clang_analyzer_isTainted(buf[0]); // expected-warning {{NO}} + clang_analyzer_isTainted_any_suffix(buf[0]); // expected-warning {{NO}} + scanf("%s", buf); + clang_analyzer_isTainted(buf[0]); // expected-warning {{YES}} + clang_analyzer_isTainted_any_suffix(buf[0]); // expected-warning {{YES}} + + int tainted_value = buf[0]; // no-warning +} + +void exactly_one_argument_required() { + char buf[32] = ""; + scanf("%s", buf); + clang_analyzer_isTainted_many_arguments(buf[0], 42, 42); + // expected-warning@-1 {{clang_analyzer_isTainted() requires exactly one argument}} +} diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt index 2c6487e8c26082..7fdc7d0be79f33 100644 --- a/clang/test/CMakeLists.txt +++ b/clang/test/CMakeLists.txt @@ -9,6 +9,15 @@ endif () string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} CLANG_TOOLS_DIR ${LLVM_RUNTIME_OUTPUT_INTDIR}) +if(CLANG_BUILT_STANDALONE) + # Set HAVE_LIBZ according to recorded LLVM_ENABLE_ZLIB value. This + # value is forced to 0 if zlib was not found, so it is fine to use it + # instead of HAVE_LIBZ (not recorded). 
+ if(LLVM_ENABLE_ZLIB) + set(HAVE_LIBZ 1) + endif() +endif() + llvm_canonicalize_cmake_booleans( CLANG_BUILD_EXAMPLES CLANG_ENABLE_ARCMT @@ -16,7 +25,7 @@ llvm_canonicalize_cmake_booleans( CLANG_SPAWN_CC1 ENABLE_BACKTRACES ENABLE_EXPERIMENTAL_NEW_PASS_MANAGER - LLVM_ENABLE_ZLIB + HAVE_LIBZ LLVM_ENABLE_PER_TARGET_RUNTIME_DIR LLVM_ENABLE_PLUGINS LLVM_ENABLE_THREADS) diff --git a/clang/test/CodeGenCXX/debug-info-template-parameter.cpp b/clang/test/CodeGenCXX/debug-info-template-parameter.cpp new file mode 100644 index 00000000000000..95e7a187fe1026 --- /dev/null +++ b/clang/test/CodeGenCXX/debug-info-template-parameter.cpp @@ -0,0 +1,29 @@ +// Test for DebugInfo for Defaulted parameters for C++ templates +// Supported: -O0, standalone DI + +// RUN: %clang_cc1 -dwarf-version=5 -emit-llvm -triple x86_64-linux-gnu %s -o - \ +// RUN: -O0 -disable-llvm-passes \ +// RUN: -debug-info-kind=standalone \ +// RUN: | FileCheck %s + +// CHECK: DILocalVariable(name: "f1", {{.*}}, type: ![[TEMPLATE_TYPE:[0-9]+]] +// CHECK: [[TEMPLATE_TYPE]] = {{.*}}!DICompositeType({{.*}}, templateParams: ![[F1_TYPE:[0-9]+]] +// CHECK: [[F1_TYPE]] = !{![[FIRST:[0-9]+]], ![[SECOND:[0-9]+]]} +// CHECK: [[FIRST]] = !DITemplateTypeParameter(name: "T", type: !{{[0-9]*}}) +// CHECK: [[SECOND]] = !DITemplateValueParameter(name: "i", type: !{{[0-9]*}}, value: i32 6) + +// CHECK: DILocalVariable(name: "f2", {{.*}}, type: ![[TEMPLATE_TYPE:[0-9]+]] +// CHECK: [[TEMPLATE_TYPE]] = {{.*}}!DICompositeType({{.*}}, templateParams: ![[F2_TYPE:[0-9]+]] +// CHECK: [[F2_TYPE]] = !{![[FIRST:[0-9]+]], ![[SECOND:[0-9]+]]} +// CHECK: [[FIRST]] = !DITemplateTypeParameter(name: "T", type: !{{[0-9]*}}, defaulted: true) +// CHECK: [[SECOND]] = !DITemplateValueParameter(name: "i", type: !{{[0-9]*}}, defaulted: true, value: i32 3) + +template +class foo { +}; + +int main() { + foo f1; + foo<> f2; + return 0; +} diff --git a/clang/test/OpenMP/allocate_allocator_messages.cpp b/clang/test/OpenMP/allocate_allocator_messages.cpp index 
0c4d36fc5f5695..3ab735acedb918 100644 --- a/clang/test/OpenMP/allocate_allocator_messages.cpp +++ b/clang/test/OpenMP/allocate_allocator_messages.cpp @@ -10,10 +10,10 @@ int sss; #pragma omp allocate(sss) allocat // expected-warning {{extra tokens at the end of '#pragma omp allocate' are ignored}} #pragma omp allocate(sss) allocate(sss) // expected-error {{unexpected OpenMP clause 'allocate' in directive '#pragma omp allocate'}} #pragma omp allocate(sss) allocator // expected-error {{expected '(' after 'allocator'}} -#pragma omp allocate(sss) allocator(0, // expected-error {{expected ')'}} expected-error {{omp_allocator_handle_t type not found; include }} expected-note {{to match this '('}} -#pragma omp allocate(sss) allocator(0,sss // expected-error {{expected ')'}} expected-error {{omp_allocator_handle_t type not found; include }} expected-note {{to match this '('}} -#pragma omp allocate(sss) allocator(0,sss) // expected-error {{expected ')'}} expected-error {{omp_allocator_handle_t type not found; include }} expected-note {{to match this '('}} -#pragma omp allocate(sss) allocator(sss) // expected-error {{omp_allocator_handle_t type not found; include }} +#pragma omp allocate(sss) allocator(0, // expected-error {{expected ')'}} expected-error {{'omp_allocator_handle_t' type not found; include }} expected-note {{to match this '('}} +#pragma omp allocate(sss) allocator(0,sss // expected-error {{expected ')'}} expected-error {{'omp_allocator_handle_t' type not found; include }} expected-note {{to match this '('}} +#pragma omp allocate(sss) allocator(0,sss) // expected-error {{expected ')'}} expected-error {{'omp_allocator_handle_t' type not found; include }} expected-note {{to match this '('}} +#pragma omp allocate(sss) allocator(sss) // expected-error {{'omp_allocator_handle_t' type not found; include }} typedef void **omp_allocator_handle_t; extern const omp_allocator_handle_t omp_default_mem_alloc; diff --git a/clang/test/OpenMP/depobj_ast_print.cpp 
b/clang/test/OpenMP/depobj_ast_print.cpp new file mode 100644 index 00000000000000..9d1d408c058c0e --- /dev/null +++ b/clang/test/OpenMP/depobj_ast_print.cpp @@ -0,0 +1,44 @@ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +typedef void *omp_depend_t; + +void foo() {} + +template +T tmain(T argc) { + static T a; +#pragma omp depobj(a) depend(in:argc) +#pragma omp depobj(argc) destroy + return argc; +} +// CHECK: static T a; +// CHECK-NEXT: #pragma omp depobj (a) depend(in : argc){{$}} +// CHECK-NEXT: #pragma omp depobj (argc) destroy{{$}} +// CHECK: static void *a; +// CHECK-NEXT: #pragma omp depobj (a) depend(in : argc){{$}} +// CHECK-NEXT: #pragma omp depobj (argc) destroy{{$}} + +int main(int argc, char **argv) { + static omp_depend_t a; + omp_depend_t b; +// CHECK: static omp_depend_t a; +// CHECK-NEXT: omp_depend_t b; +#pragma omp depobj(a) depend(out:argc, argv) +#pragma omp depobj(b) destroy +// CHECK-NEXT: #pragma omp depobj (a) depend(out : argc,argv) +// CHECK-NEXT: #pragma omp depobj (b) destroy + (void)tmain(a), tmain(b); + return 0; +} + +#endif diff --git a/clang/test/OpenMP/depobj_messages.cpp b/clang/test/OpenMP/depobj_messages.cpp new file mode 100644 index 00000000000000..b820a0eb517d6d --- /dev/null +++ b/clang/test/OpenMP/depobj_messages.cpp @@ -0,0 +1,156 @@ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 
-ferror-limit 100 %s -Wuninitialized + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 %s -Wuninitialized + +struct S1 { // expected-note 2 {{declared here}} + int a; +} s; + +#pragma omp depobj(0) depend(in:s) // expected-error {{unexpected OpenMP directive '#pragma omp depobj'}} +void foo() { +#pragma omp depobj(0) depend(in:s) // expected-error {{'omp_depend_t' type not found; include }} expected-error {{expected lvalue expression}}} +} + +typedef void *omp_depend_t; + +template +T tmain(T argc) { + omp_depend_t x; +#pragma omp depobj() allocate(argc) // expected-error {{expected expression}} expected-error {{expected depobj expression}} expected-error {{unexpected OpenMP clause 'allocate' in directive '#pragma omp depobj'}} + ; +#pragma omp depobj(x) untied // expected-error {{unexpected OpenMP clause 'untied' in directive '#pragma omp depobj'}} +#pragma omp depobj(x) unknown // expected-warning {{extra tokens at the end of '#pragma omp depobj' are ignored}} + if (argc) +#pragma omp depobj(x) destroy // expected-error {{'#pragma omp depobj' cannot be an immediate substatement}} + if (argc) { +#pragma omp depobj(x) depend(in:s) + } + while (argc) +#pragma omp depobj(x) depend(in:s) // expected-error {{'#pragma omp depobj' cannot be an immediate substatement}} + while (argc) { +#pragma omp depobj(x) depend(in:s) + } + do +#pragma omp depobj(x) depend(in:s) // expected-error {{'#pragma omp depobj' cannot be an immediate substatement}} + while (argc) + ; + do { +#pragma omp depobj(x) depend(in:s) + } while (argc); + switch (argc) +#pragma omp depobj(x) depend(in:s) // expected-error {{'#pragma omp depobj' cannot be an immediate substatement}} + switch (argc) + case 1: +#pragma omp depobj(x) depend(in:s) // expected-error {{'#pragma omp depobj' cannot be an immediate substatement}} + switch (argc) + case 1: { +#pragma omp depobj(x) depend(in:s) + } + switch (argc) { +#pragma omp depobj(x) depend(in:s) + case 1: +#pragma omp depobj(x) 
depend(in:s) + break; + default: { +#pragma omp depobj(x) depend(in:s) + } break; + } + for (;;) +#pragma omp depobj(x) depend(in:s) // expected-error {{'#pragma omp depobj' cannot be an immediate substatement}} + for (;;) { +#pragma omp depobj(x) depend(in:s) + } +label: +#pragma omp depobj(x) depend(in:s) +label1 : { +#pragma omp depobj(x) depend(in:s) +} + +#pragma omp depobj // expected-error {{expected depobj expression}} +#pragma omp depobj( // expected-error {{expected expression}} expected-error {{expected depobj expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} +#pragma omp depobj() // expected-error {{expected expression}} expected-error {{expected depobj expression}} +#pragma omp depobj(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected lvalue expression of 'omp_depend_t' type, not 'int'}}} +#pragma omp depobj(argc, // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected lvalue expression of 'omp_depend_t' type, not 'int'}} +#pragma omp depobj(argc) // expected-error {{expected lvalue expression of 'omp_depend_t' type, not 'int'}} +#pragma omp depobj(S1) // expected-error {{'S1' does not refer to a value}} expected-error {{expected depobj expression}} +#pragma omp depobj(argc) depobj(argc) // expected-warning {{extra tokens at the end of '#pragma omp depobj' are ignored}} expected-error {{expected lvalue expression of 'omp_depend_t' type, not 'int'}}} +#pragma omp parallel depobj(argc) // expected-warning {{extra tokens at the end of '#pragma omp parallel' are ignored}} + ; + return T(); +} + +int main(int argc, char **argv) { +omp_depend_t x; +#pragma omp depobj(x) depend(in:s) + ; +#pragma omp depobj(x) untied // expected-error {{unexpected OpenMP clause 'untied' in directive '#pragma omp depobj'}} +#pragma omp depobj(x) unknown // expected-warning {{extra tokens at the end of '#pragma omp depobj' are ignored}} + if (argc) +#pragma 
omp depobj(x) depend(in:s) // expected-error {{'#pragma omp depobj' cannot be an immediate substatement}} + if (argc) { +#pragma omp depobj(x) depend(in:s) + } + while (argc) +#pragma omp depobj(x) depend(in:s) // expected-error {{'#pragma omp depobj' cannot be an immediate substatement}} + while (argc) { +#pragma omp depobj(x) depend(in:s) + } + do +#pragma omp depobj(x) depend(in:s) // expected-error {{'#pragma omp depobj' cannot be an immediate substatement}} + while (argc) + ; + do { +#pragma omp depobj(x) depend(in:s) + } while (argc); + switch (argc) +#pragma omp depobj(x) depend(in:s) // expected-error {{'#pragma omp depobj' cannot be an immediate substatement}} + switch (argc) + case 1: +#pragma omp depobj(x) depend(in:s) // expected-error {{'#pragma omp depobj' cannot be an immediate substatement}} + switch (argc) + case 1: { +#pragma omp depobj(x) depend(in:s) + } + switch (argc) { +#pragma omp depobj(x) depend(in:s) + case 1: +#pragma omp depobj(x) depend(in:s) + break; + default: { +#pragma omp depobj(x) depend(in:s) + } break; + } + for (;;) +#pragma omp depobj(x) depend(in:s) // expected-error {{'#pragma omp depobj' cannot be an immediate substatement}} + for (;;) { +#pragma omp depobj(x) depend(in:s) + } +label: +#pragma omp depobj(x) depend(in:s) +label1 : { +#pragma omp depobj(x) depend(in:s) +} + +#pragma omp depobj // expected-error {{expected depobj expression}} +#pragma omp depobj( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected depobj expression}} +#pragma omp depobj() // expected-error {{expected expression}} expected-error {{expected depobj expression}} +#pragma omp depobj(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected lvalue expression of 'omp_depend_t' type, not 'int'}} +#pragma omp depobj(argc, // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected lvalue 
expression of 'omp_depend_t' type, not 'int'}} +#pragma omp depobj(argc) // expected-error {{expected lvalue expression of 'omp_depend_t' type, not 'int'}} +#pragma omp depobj(S1) // expected-error {{'S1' does not refer to a value}} expected-error {{expected depobj expression}} +#pragma omp depobj(argc) depobj(argc) // expected-warning {{extra tokens at the end of '#pragma omp depobj' are ignored}} expected-error {{expected lvalue expression of 'omp_depend_t' type, not 'int'}} +#pragma omp parallel depobj(argc) // expected-warning {{extra tokens at the end of '#pragma omp parallel' are ignored}} + ; +#pragma omp depobj(x) seq_cst // expected-error {{unexpected OpenMP clause 'seq_cst' in directive '#pragma omp depobj'}} +#pragma omp depobj(x) depend(in: x) +#pragma omp depobj(x) destroy destroy // expected-error {{directive '#pragma omp depobj' cannot contain more than one 'destroy' clause}} +#pragma omp depobj(x) depend(in: x) destroy // expected-error {{exactly one of 'depend', 'destroy', or 'update' clauses is expected}} +#pragma omp depobj(x) destroy depend(in: x) // expected-error {{exactly one of 'depend', 'destroy', or 'update' clauses is expected}} +#pragma omp depobj(x) (x) depend(in: x) // expected-warning {{extra tokens at the end of '#pragma omp depobj' are ignored}} +#pragma omp depobj(x) depend(in: x) depend(out:x) // expected-error {{exactly one of 'depend', 'destroy', or 'update' clauses is expected}} +#pragma omp depend(out:x) depobj(x) // expected-error {{expected an OpenMP directive}} +#pragma omp destroy depobj(x) // expected-error {{expected an OpenMP directive}} +#pragma omp depobj depend(in:x) (x) // expected-error {{expected depobj expression}} expected-warning {{extra tokens at the end of '#pragma omp depobj' are ignored}} +#pragma omp depobj destroy (x) // expected-error {{expected depobj expression}} expected-warning {{extra tokens at the end of '#pragma omp depobj' are ignored}} + return tmain(argc); // expected-note {{in instantiation of 
function template specialization 'tmain' requested here}} +} diff --git a/clang/test/OpenMP/flush_messages.cpp b/clang/test/OpenMP/flush_messages.cpp index 51497249a8f386..7d20e385bfafac 100644 --- a/clang/test/OpenMP/flush_messages.cpp +++ b/clang/test/OpenMP/flush_messages.cpp @@ -142,7 +142,7 @@ label1 : { #pragma omp flush seq_cst // expected-error {{unexpected OpenMP clause 'seq_cst' in directive '#pragma omp flush'}} #pragma omp flush acq_rel acquire // omp45-error {{unexpected OpenMP clause 'acq_rel' in directive '#pragma omp flush'}} omp45-error {{unexpected OpenMP clause 'acquire' in directive '#pragma omp flush'}} omp50-error {{directive '#pragma omp flush' cannot contain more than one 'acq_rel', 'acquire' or 'release' clause}} omp50-note {{'acq_rel' clause used here}} #pragma omp flush release acquire // omp45-error {{unexpected OpenMP clause 'release' in directive '#pragma omp flush'}} omp45-error {{unexpected OpenMP clause 'acquire' in directive '#pragma omp flush'}} omp50-error {{directive '#pragma omp flush' cannot contain more than one 'acq_rel', 'acquire' or 'release' clause}} omp50-note {{'release' clause used here}} -#pragma omp flush acq_rel (argc) // omp45-error {{unexpected OpenMP clause 'acq_rel' in directive '#pragma omp flush'}} omp50-error {{'flush' directive with memory order clause 'acq_rel' cannot have the list}} omp50-note {{memory order clause 'acq_rel' is specified here}} +#pragma omp flush acq_rel (argc) // omp45-error {{unexpected OpenMP clause 'acq_rel' in directive '#pragma omp flush'}} expected-warning {{extra tokens at the end of '#pragma omp flush' are ignored}} #pragma omp flush(argc) acq_rel // omp45-error {{unexpected OpenMP clause 'acq_rel' in directive '#pragma omp flush'}} omp50-error {{'flush' directive with memory order clause 'acq_rel' cannot have the list}} omp50-note {{memory order clause 'acq_rel' is specified here}} return tmain(argc); } diff --git a/clang/test/Preprocessor/init-aarch64.c 
b/clang/test/Preprocessor/init-aarch64.c index 380e5e2d726181..df2a6128989bde 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -236,7 +236,7 @@ // AARCH64-NEXT: #define __STDC_HOSTED__ 1 // AARCH64-NEXT: #define __STDC_UTF_16__ 1 // AARCH64-NEXT: #define __STDC_UTF_32__ 1 -// AARCH64_C: #define __STDC_VERSION__ 201112L +// AARCH64_C: #define __STDC_VERSION__ 201710L // AARCH64-NEXT: #define __STDC__ 1 // AARCH64-NEXT: #define __UINT16_C_SUFFIX__ // AARCH64-NEXT: #define __UINT16_FMTX__ "hX" @@ -646,7 +646,7 @@ // AARCH64-MSVC: #define __STDC_HOSTED__ 0 // AARCH64-MSVC: #define __STDC_UTF_16__ 1 // AARCH64-MSVC: #define __STDC_UTF_32__ 1 -// AARCH64-MSVC: #define __STDC_VERSION__ 201112L +// AARCH64-MSVC: #define __STDC_VERSION__ 201710L // AARCH64-MSVC: #define __STDC__ 1 // AARCH64-MSVC: #define __UINT16_C_SUFFIX__ // AARCH64-MSVC: #define __UINT16_MAX__ 65535 diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c index f38f87ddef8e63..e987a3b3b93d9f 100644 --- a/clang/test/Preprocessor/init.c +++ b/clang/test/Preprocessor/init.c @@ -117,7 +117,7 @@ // RUN: %clang_cc1 -E -dM -triple=x86_64-apple-darwin < /dev/null | FileCheck -match-full-lines -check-prefix C-DEFAULT %s // RUN: %clang_cc1 -E -dM -triple=armv7a-apple-darwin < /dev/null | FileCheck -match-full-lines -check-prefix C-DEFAULT %s // -// C-DEFAULT:#define __STDC_VERSION__ 201112L +// C-DEFAULT:#define __STDC_VERSION__ 201710L // // RUN: %clang_cc1 -ffreestanding -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix FREESTANDING %s // FREESTANDING:#define __STDC_HOSTED__ 0 @@ -2098,7 +2098,7 @@ // MIPS32BE:#define __SIZE_WIDTH__ 32 // MIPS32BE-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 8U // MIPS32BE:#define __STDC_HOSTED__ 0 -// MIPS32BE-C:#define __STDC_VERSION__ 201112L +// MIPS32BE-C:#define __STDC_VERSION__ 201710L // MIPS32BE:#define __STDC__ 1 // MIPS32BE:#define __UINT16_C_SUFFIX__ // MIPS32BE:#define 
__UINT16_MAX__ 65535 @@ -2557,7 +2557,7 @@ // MIPSN32BE: #define __STDC_HOSTED__ 0 // MIPSN32BE: #define __STDC_UTF_16__ 1 // MIPSN32BE: #define __STDC_UTF_32__ 1 -// MIPSN32BE-C: #define __STDC_VERSION__ 201112L +// MIPSN32BE-C: #define __STDC_VERSION__ 201710L // MIPSN32BE: #define __STDC__ 1 // MIPSN32BE: #define __UINT16_C_SUFFIX__ // MIPSN32BE: #define __UINT16_FMTX__ "hX" @@ -2864,7 +2864,7 @@ // MIPSN32EL: #define __STDC_HOSTED__ 0 // MIPSN32EL: #define __STDC_UTF_16__ 1 // MIPSN32EL: #define __STDC_UTF_32__ 1 -// MIPSN32EL: #define __STDC_VERSION__ 201112L +// MIPSN32EL: #define __STDC_VERSION__ 201710L // MIPSN32EL: #define __STDC__ 1 // MIPSN32EL: #define __UINT16_C_SUFFIX__ // MIPSN32EL: #define __UINT16_FMTX__ "hX" @@ -5390,7 +5390,7 @@ // PPC-DARWIN:#define __SIZE_TYPE__ long unsigned int // PPC-DARWIN:#define __SIZE_WIDTH__ 32 // PPC-DARWIN:#define __STDC_HOSTED__ 0 -// PPC-DARWIN:#define __STDC_VERSION__ 201112L +// PPC-DARWIN:#define __STDC_VERSION__ 201710L // PPC-DARWIN:#define __STDC__ 1 // PPC-DARWIN:#define __UINT16_C_SUFFIX__ // PPC-DARWIN:#define __UINT16_MAX__ 65535 @@ -6602,7 +6602,7 @@ // X86_64-CLOUDABI:#define __STDC_ISO_10646__ 201206L // X86_64-CLOUDABI:#define __STDC_UTF_16__ 1 // X86_64-CLOUDABI:#define __STDC_UTF_32__ 1 -// X86_64-CLOUDABI:#define __STDC_VERSION__ 201112L +// X86_64-CLOUDABI:#define __STDC_VERSION__ 201710L // X86_64-CLOUDABI:#define __STDC__ 1 // X86_64-CLOUDABI:#define __UINT16_C_SUFFIX__ // X86_64-CLOUDABI:#define __UINT16_FMTX__ "hX" @@ -7601,7 +7601,7 @@ // WEBASSEMBLY-NOT:#define __STDC_NO_THREADS__ // WEBASSEMBLY-NEXT:#define __STDC_UTF_16__ 1 // WEBASSEMBLY-NEXT:#define __STDC_UTF_32__ 1 -// WEBASSEMBLY-NEXT:#define __STDC_VERSION__ 201112L +// WEBASSEMBLY-NEXT:#define __STDC_VERSION__ 201710L // WEBASSEMBLY-NEXT:#define __STDC__ 1 // WEBASSEMBLY-NEXT:#define __UINT16_C_SUFFIX__ // WEBASSEMBLY-NEXT:#define __UINT16_FMTX__ "hX" @@ -8166,7 +8166,7 @@ // RISCV32: #define __STDC_HOSTED__ 0 // RISCV32: #define 
__STDC_UTF_16__ 1 // RISCV32: #define __STDC_UTF_32__ 1 -// RISCV32: #define __STDC_VERSION__ 201112L +// RISCV32: #define __STDC_VERSION__ 201710L // RISCV32: #define __STDC__ 1 // RISCV32: #define __UINT16_C_SUFFIX__ // RISCV32: #define __UINT16_MAX__ 65535 @@ -8373,7 +8373,7 @@ // RISCV64: #define __STDC_HOSTED__ 0 // RISCV64: #define __STDC_UTF_16__ 1 // RISCV64: #define __STDC_UTF_32__ 1 -// RISCV64: #define __STDC_VERSION__ 201112L +// RISCV64: #define __STDC_VERSION__ 201710L // RISCV64: #define __STDC__ 1 // RISCV64: #define __UINT16_C_SUFFIX__ // RISCV64: #define __UINT16_MAX__ 65535 diff --git a/clang/test/Sema/fallthrough-comment.c b/clang/test/Sema/fallthrough-comment.c deleted file mode 100644 index 85d1257932f668..00000000000000 --- a/clang/test/Sema/fallthrough-comment.c +++ /dev/null @@ -1,20 +0,0 @@ -// RUN: %clang_cc1 -fsyntax-only -std=c11 -verify -Wimplicit-fallthrough %s - -int fallthrough_comment(int n) { - switch (n) { - case 0: - n++; - // FALLTHROUGH - case 1: - n++; - - /*fall-through.*/ - - case 2: - n++; - case 3: // expected-warning{{unannotated fall-through between switch labels}} expected-note{{insert '__attribute__((fallthrough));' to silence this warning}} expected-note{{insert 'break;' to avoid fall-through}} - n++; - break; - } - return n; -} diff --git a/clang/test/Sema/warn-documentation.m b/clang/test/Sema/warn-documentation.m index c713d5b07f85e9..5d60a52ae6fed7 100644 --- a/clang/test/Sema/warn-documentation.m +++ b/clang/test/Sema/warn-documentation.m @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -fblocks -Wno-objc-root-class -Wdocumentation -Wdocumentation-pedantic -verify %s +// RUN: %clang_cc1 -xobjective-c++ -fsyntax-only -fblocks -Wno-objc-root-class -Wdocumentation -Wdocumentation-pedantic -verify %s @class NSString; @@ -318,3 +319,10 @@ @interface CheckFunctionBlockPointerVars { // expected-warning@-1 {{'\return' command used in a comment that is not attached to a function or method declaration}} VoidBlockTypeCall 
^e; ///< \return none // expected-warning@-1 {{'\return' command used in a comment that is not attached to a function or method declaration}} + +#ifdef __cplusplus +@interface HasAnonNamespace @end +@implementation HasAnonNamespace +namespace {} +@end +#endif diff --git a/clang/test/SemaObjC/signed-char-bool-conversion.m b/clang/test/SemaObjC/signed-char-bool-conversion.m index 6945d86fc26d17..183f60fafcd5ad 100644 --- a/clang/test/SemaObjC/signed-char-bool-conversion.m +++ b/clang/test/SemaObjC/signed-char-bool-conversion.m @@ -69,6 +69,11 @@ void t3(struct has_bf *bf) { b = local.nested->unsigned_bf2; // expected-warning{{implicit conversion from integral type 'unsigned int' to 'BOOL'}} } +void t4(BoolProp *bp) { + BOOL local = YES; + bp.p = 1 ? local : NO; // no warning +} + __attribute__((objc_root_class)) @interface BFIvar { struct has_bf bf; diff --git a/clang/test/lit.site.cfg.py.in b/clang/test/lit.site.cfg.py.in index 39c8b47adf926e..62616d9a2b959b 100644 --- a/clang/test/lit.site.cfg.py.in +++ b/clang/test/lit.site.cfg.py.in @@ -16,7 +16,7 @@ config.host_triple = "@LLVM_HOST_TRIPLE@" config.target_triple = "@TARGET_TRIPLE@" config.host_cxx = "@CMAKE_CXX_COMPILER@" config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" -config.have_zlib = @LLVM_ENABLE_ZLIB@ +config.have_zlib = @HAVE_LIBZ@ config.clang_arcmt = @CLANG_ENABLE_ARCMT@ config.clang_default_cxx_stdlib = "@CLANG_DEFAULT_CXX_STDLIB@" config.clang_staticanalyzer = @CLANG_ENABLE_STATIC_ANALYZER@ diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 6f32240fe6e48b..62dc0e2b8f9265 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -2047,6 +2047,7 @@ class EnqueueVisitor : public ConstStmtVisitor { VisitOMPCancellationPointDirective(const OMPCancellationPointDirective *D); void VisitOMPCancelDirective(const OMPCancelDirective *D); void VisitOMPFlushDirective(const OMPFlushDirective *D); + void VisitOMPDepobjDirective(const OMPDepobjDirective 
*D); void VisitOMPOrderedDirective(const OMPOrderedDirective *D); void VisitOMPAtomicDirective(const OMPAtomicDirective *D); void VisitOMPTargetDirective(const OMPTargetDirective *D); @@ -2249,6 +2250,8 @@ void OMPClauseEnqueue::VisitOMPSIMDClause(const OMPSIMDClause *) {} void OMPClauseEnqueue::VisitOMPNogroupClause(const OMPNogroupClause *) {} +void OMPClauseEnqueue::VisitOMPDestroyClause(const OMPDestroyClause *) {} + void OMPClauseEnqueue::VisitOMPUnifiedAddressClause( const OMPUnifiedAddressClause *) {} @@ -2444,6 +2447,9 @@ OMPClauseEnqueue::VisitOMPCopyprivateClause(const OMPCopyprivateClause *C) { void OMPClauseEnqueue::VisitOMPFlushClause(const OMPFlushClause *C) { VisitOMPClauseList(C); } +void OMPClauseEnqueue::VisitOMPDepobjClause(const OMPDepobjClause *C) { + Visitor->AddStmt(C->getDepobj()); +} void OMPClauseEnqueue::VisitOMPDependClause(const OMPDependClause *C) { VisitOMPClauseList(C); } @@ -2871,6 +2877,10 @@ void EnqueueVisitor::VisitOMPFlushDirective(const OMPFlushDirective *D) { VisitOMPExecutableDirective(D); } +void EnqueueVisitor::VisitOMPDepobjDirective(const OMPDepobjDirective *D) { + VisitOMPExecutableDirective(D); +} + void EnqueueVisitor::VisitOMPOrderedDirective(const OMPOrderedDirective *D) { VisitOMPExecutableDirective(D); } @@ -2883,8 +2893,8 @@ void EnqueueVisitor::VisitOMPTargetDirective(const OMPTargetDirective *D) { VisitOMPExecutableDirective(D); } -void EnqueueVisitor::VisitOMPTargetDataDirective(const - OMPTargetDataDirective *D) { +void EnqueueVisitor::VisitOMPTargetDataDirective( + const OMPTargetDataDirective *D) { VisitOMPExecutableDirective(D); } @@ -5503,6 +5513,8 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) { return cxstring::createRef("OMPTaskgroupDirective"); case CXCursor_OMPFlushDirective: return cxstring::createRef("OMPFlushDirective"); + case CXCursor_OMPDepobjDirective: + return cxstring::createRef("OMPDepobjDirective"); case CXCursor_OMPOrderedDirective: return 
cxstring::createRef("OMPOrderedDirective"); case CXCursor_OMPAtomicDirective: diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp index 04b713c68b8079..e10c742c65eae7 100644 --- a/clang/tools/libclang/CXCursor.cpp +++ b/clang/tools/libclang/CXCursor.cpp @@ -635,6 +635,9 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent, case Stmt::OMPFlushDirectiveClass: K = CXCursor_OMPFlushDirective; break; + case Stmt::OMPDepobjDirectiveClass: + K = CXCursor_OMPDepobjDirective; + break; case Stmt::OMPOrderedDirectiveClass: K = CXCursor_OMPOrderedDirective; break; diff --git a/clang/unittests/Format/FormatTestCSharp.cpp b/clang/unittests/Format/FormatTestCSharp.cpp index 0bc49856375b00..d22e0da82321ec 100644 --- a/clang/unittests/Format/FormatTestCSharp.cpp +++ b/clang/unittests/Format/FormatTestCSharp.cpp @@ -607,6 +607,7 @@ TEST_F(FormatTestCSharp, CSharpSpaces) { Style.SpacesInSquareBrackets = true; verifyFormat(R"(private float[ , ] Values;)", Style); + verifyFormat(R"(string dirPath = args?[ 0 ];)", Style); } TEST_F(FormatTestCSharp, CSharpNullableTypes) { diff --git a/clang/unittests/Tooling/Syntax/TokensTest.cpp b/clang/unittests/Tooling/Syntax/TokensTest.cpp index ad0293bc3e072d..d4b015393286bd 100644 --- a/clang/unittests/Tooling/Syntax/TokensTest.cpp +++ b/clang/unittests/Tooling/Syntax/TokensTest.cpp @@ -59,6 +59,7 @@ using ::testing::ElementsAre; using ::testing::Field; using ::testing::Matcher; using ::testing::Not; +using ::testing::Pointee; using ::testing::StartsWith; namespace { @@ -363,6 +364,12 @@ TEST_F(TokenCollectorTest, Locations) { AllOf(Kind(tok::equal), RangeIs(Code.range("r3"))), AllOf(Kind(tok::string_literal), RangeIs(Code.range("r4"))), AllOf(Kind(tok::semi), RangeIs(Code.range("r5"))))); + + auto StartLoc = SourceMgr->getLocForStartOfFile(SourceMgr->getMainFileID()); + for (auto &R : Code.ranges()) { + EXPECT_THAT(Buffer.spelledTokenAt(StartLoc.getLocWithOffset(R.Begin)), + Pointee(RangeIs(R))); + } 
} TEST_F(TokenCollectorTest, MacroDirectives) { diff --git a/clang/www/compatibility.html b/clang/www/compatibility.html index 9f8ee4bdc01233..a593155951dae4 100755 --- a/clang/www/compatibility.html +++ b/clang/www/compatibility.html @@ -83,7 +83,7 @@

C compatibility

C99 inline functions

-

By default, Clang builds C code in GNU C11 mode, so it uses standard C99 +

By default, Clang builds C code in GNU C17 mode, so it uses standard C99 semantics for the inline keyword. These semantics are different from those in GNU C89 mode, which is the default mode in versions of GCC prior to 5.0. For example, consider the following code:

diff --git a/compiler-rt/lib/fuzzer/FuzzerUtil.cpp b/compiler-rt/lib/fuzzer/FuzzerUtil.cpp index 87180d1ea85d52..7eecb68d0729da 100644 --- a/compiler-rt/lib/fuzzer/FuzzerUtil.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerUtil.cpp @@ -161,20 +161,21 @@ std::string Base64(const Unit &U) { size_t i = 0, j = 0; for (size_t n = U.size() / 3 * 3; i < n; i += 3, j += 4) { - uint32_t x = (U[i] << 16) | (U[i + 1] << 8) | U[i + 2]; + uint32_t x = ((unsigned char)U[i] << 16) | ((unsigned char)U[i + 1] << 8) | + (unsigned char)U[i + 2]; Buffer[j + 0] = Table[(x >> 18) & 63]; Buffer[j + 1] = Table[(x >> 12) & 63]; Buffer[j + 2] = Table[(x >> 6) & 63]; Buffer[j + 3] = Table[x & 63]; } if (i + 1 == U.size()) { - uint32_t x = (U[i] << 16); + uint32_t x = ((unsigned char)U[i] << 16); Buffer[j + 0] = Table[(x >> 18) & 63]; Buffer[j + 1] = Table[(x >> 12) & 63]; Buffer[j + 2] = '='; Buffer[j + 3] = '='; } else if (i + 2 == U.size()) { - uint32_t x = (U[i] << 16) | (U[i + 1] << 8); + uint32_t x = ((unsigned char)U[i] << 16) | ((unsigned char)U[i + 1] << 8); Buffer[j + 0] = Table[(x >> 18) & 63]; Buffer[j + 1] = Table[(x >> 12) & 63]; Buffer[j + 2] = Table[(x >> 6) & 63]; diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in index 4de8d030070f3c..60464bcdaa877c 100644 --- a/compiler-rt/test/lit.common.configured.in +++ b/compiler-rt/test/lit.common.configured.in @@ -51,7 +51,7 @@ if config.enable_per_target_runtime_dir: else: set_default("target_suffix", "-%s" % config.target_arch) -set_default("have_zlib", "@LLVM_ENABLE_ZLIB@") +set_default("have_zlib", "@HAVE_LIBZ@") set_default("libcxx_used", "@LLVM_LIBCXX_USED@") # LLVM tools dir can be passed in lit parameters, so try to diff --git a/compiler-rt/test/profile/instrprof-merging.cpp b/compiler-rt/test/profile/instrprof-merging.cpp index 26c191a715372b..692b049ec45c31 100644 --- a/compiler-rt/test/profile/instrprof-merging.cpp +++ b/compiler-rt/test/profile/instrprof-merging.cpp @@ -1,4 +1,3 @@ 
-// UNSUPPORTED: powerpc64 // 1) Compile shared code into different object files and into an executable. // RUN: %clangxx_profgen -std=c++14 -fcoverage-mapping %s -c -o %t.v1.o \ diff --git a/compiler-rt/test/ubsan/TestCases/Misc/nullability.c b/compiler-rt/test/ubsan/TestCases/Misc/nullability.c index 849d7ee203c62a..50295fe503f9ed 100644 --- a/compiler-rt/test/ubsan/TestCases/Misc/nullability.c +++ b/compiler-rt/test/ubsan/TestCases/Misc/nullability.c @@ -1,3 +1,4 @@ +// UNSUPPORTED: android // RUN: %clang -w -fsanitize=nullability-arg,nullability-assign,nullability-return %s -O3 -o %t // RUN: %run %t foo 2>&1 | count 0 // RUN: %run %t 2>&1 | FileCheck %s @@ -5,11 +6,7 @@ // RUN: echo "nullability-arg:nullability.c" > %t.supp // RUN: echo "nullability-return:nullability.c" >> %t.supp // RUN: echo "nullability-assign:nullability.c" >> %t.supp -// RUN: UBSAN_OPTIONS=suppressions=%t.supp %run %t -// -// XXX: This test is failing on the sanitizer-x86_64-linux-android, but not -// in a way that provides debuggable output. Relax the check so we can debug. 
-// 2>&1 | FileCheck -allow-empty -check-prefix=SUPPRESS %s +// RUN: UBSAN_OPTIONS=suppressions=%t.supp %run %t 2>&1 | FileCheck -allow-empty -check-prefix=SUPPRESS %s // SUPPRESS-NOT: runtime error // CHECK: nullability.c:[[@LINE+2]]:41: runtime error: null pointer returned from function declared to never return null diff --git a/libc/src/signal/linux/CMakeLists.txt b/libc/src/signal/linux/CMakeLists.txt index 53bf5fc0f56b27..022f41b5a0ebb5 100644 --- a/libc/src/signal/linux/CMakeLists.txt +++ b/libc/src/signal/linux/CMakeLists.txt @@ -35,6 +35,7 @@ add_entrypoint_object( ../sigemptyset.h DEPENDS __errno_location + errno_h signal_h ) @@ -47,5 +48,6 @@ add_entrypoint_object( ../sigaddset.h DEPENDS __errno_location + errno_h signal_h ) diff --git a/libcxx/include/__config b/libcxx/include/__config index b14cc84eadeaa5..9bd7fc9932c858 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -102,6 +102,9 @@ # define _LIBCPP_ABI_OPTIMIZED_FUNCTION // All the regex constants must be distinct and nonzero. # define _LIBCPP_ABI_REGEX_CONSTANTS_NONZERO +// Re-worked external template instantiations for std::string with a focus on +// performance and fast-path inlining. 
+# define _LIBCPP_ABI_STRING_OPTIMIZED_EXTERNAL_INSTANTIATION #elif _LIBCPP_ABI_VERSION == 1 # if !defined(_LIBCPP_OBJECT_FORMAT_COFF) // Enable compiling copies of now inline methods into the dylib to support diff --git a/libcxx/include/string b/libcxx/include/string index 7688d3ff29ec96..c2a4220e276a89 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -4381,7 +4381,7 @@ basic_string<_CharT, _Traits, _Allocator>::__subscriptable(const const_iterator* #endif // _LIBCPP_DEBUG_LEVEL >= 2 -#if defined(_LIBCPP_ABI_UNSTABLE) || _LIBCPP_ABI_VERSION >= 2 +#ifdef _LIBCPP_ABI_STRING_OPTIMIZED_EXTERNAL_INSTANTIATION _LIBCPP_STRING_UNSTABLE_EXTERN_TEMPLATE_LIST(_LIBCPP_EXTERN_TEMPLATE, char) _LIBCPP_STRING_UNSTABLE_EXTERN_TEMPLATE_LIST(_LIBCPP_EXTERN_TEMPLATE, wchar_t) #else diff --git a/libcxx/src/string.cpp b/libcxx/src/string.cpp index 0345170a70ee8a..5105594cf38b8d 100644 --- a/libcxx/src/string.cpp +++ b/libcxx/src/string.cpp @@ -20,7 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __basic_string_common; -#if defined(_LIBCPP_ABI_UNSTABLE) || _LIBCPP_ABI_VERSION >= 2 +#ifdef _LIBCPP_ABI_STRING_OPTIMIZED_EXTERNAL_INSTANTIATION _LIBCPP_STRING_UNSTABLE_EXTERN_TEMPLATE_LIST(_LIBCPP_EXTERN_TEMPLATE_DEFINE, char) _LIBCPP_STRING_UNSTABLE_EXTERN_TEMPLATE_LIST(_LIBCPP_EXTERN_TEMPLATE_DEFINE, wchar_t) #else diff --git a/libcxx/utils/merge_archives.py b/libcxx/utils/merge_archives.py index 4c31854d2b7e35..cc96cb2aa50ce5 100755 --- a/libcxx/utils/merge_archives.py +++ b/libcxx/utils/merge_archives.py @@ -143,7 +143,7 @@ def main(): if args.use_libtool: files = [f for f in files if not f.startswith('__.SYMDEF')] - execute_command_verbose([libtool_exe, '-static', '-o', args.output] + files, + execute_command_verbose([libtool_exe, '-static', '-o', args.output, '-s'] + files, cwd=temp_directory_root, verbose=args.verbose) else: execute_command_verbose([ar_exe, 'rcs', args.output] + files, diff --git a/lld/docs/WebAssembly.rst 
b/lld/docs/WebAssembly.rst index 13ed0aeb94d4cc..b23f2cd462b4be 100644 --- a/lld/docs/WebAssembly.rst +++ b/lld/docs/WebAssembly.rst @@ -137,7 +137,7 @@ By default no undefined symbols are allowed in the final binary. The flag ``--allow-undefined`` results in a WebAssembly import being defined for each undefined symbol. It is then up to the runtime to provide such symbols. -Alternativly symbols can be marked in the source code as with the +Alternatively symbols can be marked in the source code as with the ``import_name`` and/or ``import_module`` clang attributes which signals that they are expected to be undefined at static link time. diff --git a/lld/test/CMakeLists.txt b/lld/test/CMakeLists.txt index dc8cedf2ea095e..8be42c46dd8ad4 100644 --- a/lld/test/CMakeLists.txt +++ b/lld/test/CMakeLists.txt @@ -4,8 +4,17 @@ set(LLVM_BUILD_MODE "%(build_mode)s") set(LLVM_TOOLS_DIR "${LLVM_TOOLS_BINARY_DIR}/%(build_config)s") set(LLVM_LIBS_DIR "${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/%(build_config)s") +if(LLD_BUILT_STANDALONE) + # Set HAVE_LIBZ according to recorded LLVM_ENABLE_ZLIB value. This + # value is forced to 0 if zlib was not found, so it is fine to use it + # instead of HAVE_LIBZ (not recorded). 
+ if(LLVM_ENABLE_ZLIB) + set(HAVE_LIBZ 1) + endif() +endif() + llvm_canonicalize_cmake_booleans( - LLVM_ENABLE_ZLIB + HAVE_LIBZ LLVM_LIBXML2_ENABLED ) diff --git a/lld/test/ELF/lto/resolution-err.ll b/lld/test/ELF/lto/resolution-err.ll new file mode 100644 index 00000000000000..00cdd94059ac87 --- /dev/null +++ b/lld/test/ELF/lto/resolution-err.ll @@ -0,0 +1,16 @@ +; UNSUPPORTED: system-windows +; REQUIRES: shell +; RUN: llvm-as %s -o %t.bc +; RUN: touch %t.resolution.txt +; RUN: chmod -w %t.resolution.txt +; RUN: not ld.lld -save-temps %t.bc -o %t 2>&1 | FileCheck %s +; RUN: rm -f %t.resolution.txt + +; CHECK: error: {{[Pp]}}ermission denied{{$}} + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @_start() { + ret void +} diff --git a/lld/test/lit.site.cfg.py.in b/lld/test/lit.site.cfg.py.in index 531fce15839d52..02840f8d6a3036 100644 --- a/lld/test/lit.site.cfg.py.in +++ b/lld/test/lit.site.cfg.py.in @@ -14,7 +14,7 @@ config.lld_libs_dir = "@LLVM_LIBRARY_OUTPUT_INTDIR@" config.lld_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@" config.target_triple = "@TARGET_TRIPLE@" config.python_executable = "@PYTHON_EXECUTABLE@" -config.have_zlib = @LLVM_ENABLE_ZLIB@ +config.have_zlib = @HAVE_LIBZ@ config.sizeof_void_p = @CMAKE_SIZEOF_VOID_P@ # Support substitution of the tools and libs dirs with user parameters. This is diff --git a/lldb/docs/conf.py b/lldb/docs/conf.py index bd95cbe6cd98cd..ca1d6f79092179 100644 --- a/lldb/docs/conf.py +++ b/lldb/docs/conf.py @@ -46,12 +46,14 @@ # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the -# built documents. +# built documents. These are currently set to zero because we don't use them. +# Should somebody consider in the future to change them, they need to be updated +# everytime a new release comes out. # # The short version. 
-version = '8' +#version = '0' # The full version, including alpha/beta/rc tags. -release = '8' +#release = '0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/lldb/docs/index.rst b/lldb/docs/index.rst index f1e1eda7609aa4..29f63b32838116 100644 --- a/lldb/docs/index.rst +++ b/lldb/docs/index.rst @@ -3,7 +3,7 @@ The LLDB Debugger ================= -Welcome to the LLDB version |release| documentation! +Welcome to the LLDB documentation! LLDB is a next generation, high-performance debugger. It is built as a set of reusable components which highly leverage existing libraries in the larger LLVM diff --git a/lldb/include/lldb/Core/Disassembler.h b/lldb/include/lldb/Core/Disassembler.h index 98f34f3e0cfa2e..521c8be2bbf8cc 100644 --- a/lldb/include/lldb/Core/Disassembler.h +++ b/lldb/include/lldb/Core/Disassembler.h @@ -446,13 +446,11 @@ class Disassembler : public std::enable_shared_from_this, uint32_t num_mixed_context_lines, uint32_t options, Stream &strm); - size_t ParseInstructions(const ExecutionContext *exe_ctx, - const AddressRange &range, Stream *error_strm_ptr, - bool prefer_file_cache); + size_t ParseInstructions(Target &target, const AddressRange &range, + Stream *error_strm_ptr, bool prefer_file_cache); - size_t ParseInstructions(const ExecutionContext *exe_ctx, - const Address &range, uint32_t num_instructions, - bool prefer_file_cache); + size_t ParseInstructions(Target &target, const Address &range, + uint32_t num_instructions, bool prefer_file_cache); virtual size_t DecodeInstructions(const Address &base_addr, const DataExtractor &data, diff --git a/lldb/include/lldb/Utility/RangeMap.h b/lldb/include/lldb/Utility/RangeMap.h index 53fb691323a68b..fb24c5a434792f 100644 --- a/lldb/include/lldb/Utility/RangeMap.h +++ b/lldb/include/lldb/Utility/RangeMap.h @@ -394,19 +394,31 @@ struct RangeData : public Range { RangeData(B base, S size, DataType d) : Range(base, size), data(d) {} }; 
+// We can treat the vector as a flattened Binary Search Tree, augmenting it +// with upper bounds (max of range endpoints) for every index allows us to +// query for range containment quicker. +template +struct AugmentedRangeData : public RangeData { + B upper_bound; + + AugmentedRangeData(const RangeData &rd) + : RangeData(rd), upper_bound() {} +}; + template > class RangeDataVector { public: typedef lldb_private::Range Range; typedef RangeData Entry; - typedef llvm::SmallVector Collection; + typedef AugmentedRangeData AugmentedEntry; + typedef llvm::SmallVector Collection; RangeDataVector(Compare compare = Compare()) : m_compare(compare) {} ~RangeDataVector() = default; - void Append(const Entry &entry) { m_entries.push_back(entry); } + void Append(const Entry &entry) { m_entries.emplace_back(entry); } void Sort() { if (m_entries.size() > 1) @@ -418,13 +430,13 @@ class RangeDataVector { return a.size < b.size; return compare(a.data, b.data); }); + if (!m_entries.empty()) + ComputeUpperBounds(0, m_entries.size()); } #ifdef ASSERT_RANGEMAP_ARE_SORTED bool IsSorted() const { typename Collection::const_iterator pos, end, prev; - // First we determine if we can combine any of the Entry objects so we - // don't end up allocating and making a new collection for no reason for (pos = m_entries.begin(), end = m_entries.end(), prev = end; pos != end; prev = pos++) { if (prev != end && *pos < *prev) @@ -494,26 +506,20 @@ class RangeDataVector { } uint32_t FindEntryIndexThatContains(B addr) const { - const Entry *entry = FindEntryThatContains(addr); + const AugmentedEntry *entry = + static_cast(FindEntryThatContains(addr)); if (entry) return std::distance(m_entries.begin(), entry); return UINT32_MAX; } - uint32_t FindEntryIndexesThatContain(B addr, - std::vector &indexes) const { + uint32_t FindEntryIndexesThatContain(B addr, std::vector &indexes) { #ifdef ASSERT_RANGEMAP_ARE_SORTED assert(IsSorted()); #endif - // Search the entries until the first entry that has a larger 
base address - // than `addr`. As m_entries is sorted by their base address, all following - // entries can't contain `addr` as their base address is already larger. - for (const auto &entry : m_entries) { - if (entry.Contains(addr)) - indexes.push_back(entry.data); - else if (entry.GetRangeBase() > addr) - break; - } + if (!m_entries.empty()) + FindEntryIndexesThatContain(addr, 0, m_entries.size(), indexes); + return indexes.size(); } @@ -599,6 +605,54 @@ class RangeDataVector { protected: Collection m_entries; Compare m_compare; + +private: + // Compute extra information needed for search + B ComputeUpperBounds(size_t lo, size_t hi) { + size_t mid = (lo + hi) / 2; + AugmentedEntry &entry = m_entries[mid]; + + entry.upper_bound = entry.base + entry.size; + + if (lo < mid) + entry.upper_bound = + std::max(entry.upper_bound, ComputeUpperBounds(lo, mid)); + + if (mid + 1 < hi) + entry.upper_bound = + std::max(entry.upper_bound, ComputeUpperBounds(mid + 1, hi)); + + return entry.upper_bound; + } + + // This is based on the augmented tree implementation found at + // https://en.wikipedia.org/wiki/Interval_tree#Augmented_tree + void FindEntryIndexesThatContain(B addr, size_t lo, size_t hi, + std::vector &indexes) { + size_t mid = (lo + hi) / 2; + const AugmentedEntry &entry = m_entries[mid]; + + // addr is greater than the rightmost point of any interval below mid + // so there cannot be any matches. + if (addr > entry.upper_bound) + return; + + // Recursively search left subtree + if (lo < mid) + FindEntryIndexesThatContain(addr, lo, mid, indexes); + + // If addr is smaller than the start of the current interval it + // cannot contain it nor can any of its right subtree. 
+ if (addr < entry.base) + return; + + if (entry.Contains(addr)) + indexes.push_back(entry.data); + + // Recursively search right subtree + if (mid + 1 < hi) + FindEntryIndexesThatContain(addr, mid + 1, hi, indexes); + } }; // A simple range with data class where you get to define the type of diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index ad509b81d2bf7d..c7786534076e3f 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -40,7 +40,6 @@ #include "lldb/Target/ThreadPlanStepInstruction.h" #include "lldb/Target/ThreadPlanStepOut.h" #include "lldb/Target/ThreadPlanStepRange.h" -#include "lldb/Target/UnixSignals.h" #include "lldb/Utility/State.h" #include "lldb/Utility/Stream.h" #include "lldb/Utility/StructuredData.h" @@ -319,97 +318,26 @@ size_t SBThread::GetStopDescription(char *dst, size_t dst_len) { std::unique_lock lock; ExecutionContext exe_ctx(m_opaque_sp.get(), lock); - if (exe_ctx.HasThreadScope()) { - Process::StopLocker stop_locker; - if (stop_locker.TryLock(&exe_ctx.GetProcessPtr()->GetRunLock())) { + if (dst) + *dst = 0; - StopInfoSP stop_info_sp = exe_ctx.GetThreadPtr()->GetStopInfo(); - if (stop_info_sp) { - std::string thread_stop_desc = - exe_ctx.GetThreadPtr()->GetStopDescription(); - const char *stop_desc = thread_stop_desc.c_str(); - - if (stop_desc[0] != '\0') { - if (dst) - return ::snprintf(dst, dst_len, "%s", stop_desc); - else { - // NULL dst passed in, return the length needed to contain the - // description - return ::strlen(stop_desc) + 1; // Include the NULL byte for size - } - } else { - size_t stop_desc_len = 0; - switch (stop_info_sp->GetStopReason()) { - case eStopReasonTrace: - case eStopReasonPlanComplete: { - static char trace_desc[] = "step"; - stop_desc = trace_desc; - stop_desc_len = - sizeof(trace_desc); // Include the NULL byte for size - } break; - - case eStopReasonBreakpoint: { - static char bp_desc[] = "breakpoint hit"; - stop_desc = bp_desc; - stop_desc_len = 
sizeof(bp_desc); // Include the NULL byte for size - } break; - - case eStopReasonWatchpoint: { - static char wp_desc[] = "watchpoint hit"; - stop_desc = wp_desc; - stop_desc_len = sizeof(wp_desc); // Include the NULL byte for size - } break; - - case eStopReasonSignal: { - stop_desc = - exe_ctx.GetProcessPtr()->GetUnixSignals()->GetSignalAsCString( - stop_info_sp->GetValue()); - if (stop_desc == nullptr || stop_desc[0] == '\0') { - static char signal_desc[] = "signal"; - stop_desc = signal_desc; - stop_desc_len = - sizeof(signal_desc); // Include the NULL byte for size - } - } break; - - case eStopReasonException: { - char exc_desc[] = "exception"; - stop_desc = exc_desc; - stop_desc_len = sizeof(exc_desc); // Include the NULL byte for size - } break; - - case eStopReasonExec: { - char exc_desc[] = "exec"; - stop_desc = exc_desc; - stop_desc_len = sizeof(exc_desc); // Include the NULL byte for size - } break; - - case eStopReasonThreadExiting: { - char limbo_desc[] = "thread exiting"; - stop_desc = limbo_desc; - stop_desc_len = sizeof(limbo_desc); - } break; - default: - break; - } + if (!exe_ctx.HasThreadScope()) + return 0; - if (stop_desc && stop_desc[0]) { - if (dst) - return ::snprintf(dst, dst_len, "%s", stop_desc) + - 1; // Include the NULL byte + Process::StopLocker stop_locker; + if (!stop_locker.TryLock(&exe_ctx.GetProcessPtr()->GetRunLock())) + return 0; - if (stop_desc_len == 0) - stop_desc_len = ::strlen(stop_desc) + 1; // Include the NULL byte + std::string thread_stop_desc = exe_ctx.GetThreadPtr()->GetStopDescription(); + if (thread_stop_desc.empty()) + return 0; - return stop_desc_len; - } - } - } - } - } if (dst) - *dst = 0; - return 0; + return ::snprintf(dst, dst_len, "%s", thread_stop_desc.c_str()) + 1; + + // NULL dst passed in, return the length needed to contain the + // description. 
+ return thread_stop_desc.size() + 1; // Include the NULL byte for size } SBValue SBThread::GetStopReturnValue() { diff --git a/lldb/source/Core/Disassembler.cpp b/lldb/source/Core/Disassembler.cpp index 60247cfdd99e01..268e25fb6697ba 100644 --- a/lldb/source/Core/Disassembler.cpp +++ b/lldb/source/Core/Disassembler.cpp @@ -193,7 +193,7 @@ lldb::DisassemblerSP Disassembler::DisassembleRange( const ArchSpec &arch, const char *plugin_name, const char *flavor, const ExecutionContext &exe_ctx, const AddressRange &range, bool prefer_file_cache) { - if (range.GetByteSize() <= 0) + if (range.GetByteSize() <= 0 || !exe_ctx.GetTargetPtr()) return {}; if (!range.GetBaseAddress().IsValid()) @@ -205,8 +205,8 @@ lldb::DisassemblerSP Disassembler::DisassembleRange( if (!disasm_sp) return {}; - const size_t bytes_disassembled = - disasm_sp->ParseInstructions(&exe_ctx, range, nullptr, prefer_file_cache); + const size_t bytes_disassembled = disasm_sp->ParseInstructions( + exe_ctx.GetTargetRef(), range, nullptr, prefer_file_cache); if (bytes_disassembled == 0) return {}; @@ -243,7 +243,7 @@ bool Disassembler::Disassemble(Debugger &debugger, const ArchSpec &arch, bool mixed_source_and_assembly, uint32_t num_mixed_context_lines, uint32_t options, Stream &strm) { - if (!disasm_range.GetByteSize()) + if (!disasm_range.GetByteSize() || !exe_ctx.GetTargetPtr()) return false; lldb::DisassemblerSP disasm_sp(Disassembler::FindPluginForTarget( @@ -257,8 +257,8 @@ bool Disassembler::Disassemble(Debugger &debugger, const ArchSpec &arch, range.GetBaseAddress()); range.SetByteSize(disasm_range.GetByteSize()); const bool prefer_file_cache = false; - size_t bytes_disassembled = - disasm_sp->ParseInstructions(&exe_ctx, range, &strm, prefer_file_cache); + size_t bytes_disassembled = disasm_sp->ParseInstructions( + exe_ctx.GetTargetRef(), range, &strm, prefer_file_cache); if (bytes_disassembled == 0) return false; @@ -275,7 +275,7 @@ bool Disassembler::Disassemble(Debugger &debugger, const ArchSpec 
&arch, bool mixed_source_and_assembly, uint32_t num_mixed_context_lines, uint32_t options, Stream &strm) { - if (num_instructions == 0) + if (num_instructions == 0 || !exe_ctx.GetTargetPtr()) return false; lldb::DisassemblerSP disasm_sp(Disassembler::FindPluginForTarget( @@ -288,7 +288,7 @@ bool Disassembler::Disassemble(Debugger &debugger, const ArchSpec &arch, const bool prefer_file_cache = false; size_t bytes_disassembled = disasm_sp->ParseInstructions( - &exe_ctx, addr, num_instructions, prefer_file_cache); + exe_ctx.GetTargetRef(), addr, num_instructions, prefer_file_cache); if (bytes_disassembled == 0) return false; @@ -1182,59 +1182,51 @@ InstructionList::GetIndexOfInstructionAtLoadAddress(lldb::addr_t load_addr, return GetIndexOfInstructionAtAddress(address); } -size_t Disassembler::ParseInstructions(const ExecutionContext *exe_ctx, +size_t Disassembler::ParseInstructions(Target &target, const AddressRange &range, Stream *error_strm_ptr, bool prefer_file_cache) { - if (exe_ctx) { - Target *target = exe_ctx->GetTargetPtr(); - const addr_t byte_size = range.GetByteSize(); - if (target == nullptr || byte_size == 0 || - !range.GetBaseAddress().IsValid()) - return 0; - - auto data_sp = std::make_shared(byte_size, '\0'); - - Status error; - lldb::addr_t load_addr = LLDB_INVALID_ADDRESS; - const size_t bytes_read = target->ReadMemory( - range.GetBaseAddress(), prefer_file_cache, data_sp->GetBytes(), - data_sp->GetByteSize(), error, &load_addr); - - if (bytes_read > 0) { - if (bytes_read != data_sp->GetByteSize()) - data_sp->SetByteSize(bytes_read); - DataExtractor data(data_sp, m_arch.GetByteOrder(), - m_arch.GetAddressByteSize()); - const bool data_from_file = load_addr == LLDB_INVALID_ADDRESS; - return DecodeInstructions(range.GetBaseAddress(), data, 0, UINT32_MAX, - false, data_from_file); - } else if (error_strm_ptr) { - const char *error_cstr = error.AsCString(); - if (error_cstr) { - error_strm_ptr->Printf("error: %s\n", error_cstr); - } - } + const addr_t 
byte_size = range.GetByteSize(); + if (byte_size == 0 || !range.GetBaseAddress().IsValid()) + return 0; + + auto data_sp = std::make_shared(byte_size, '\0'); + + Status error; + lldb::addr_t load_addr = LLDB_INVALID_ADDRESS; + const size_t bytes_read = target.ReadMemory( + range.GetBaseAddress(), prefer_file_cache, data_sp->GetBytes(), + data_sp->GetByteSize(), error, &load_addr); + + if (bytes_read > 0) { + if (bytes_read != data_sp->GetByteSize()) + data_sp->SetByteSize(bytes_read); + DataExtractor data(data_sp, m_arch.GetByteOrder(), + m_arch.GetAddressByteSize()); + const bool data_from_file = load_addr == LLDB_INVALID_ADDRESS; + return DecodeInstructions(range.GetBaseAddress(), data, 0, UINT32_MAX, + false, data_from_file); } else if (error_strm_ptr) { - error_strm_ptr->PutCString("error: invalid execution context\n"); + const char *error_cstr = error.AsCString(); + if (error_cstr) { + error_strm_ptr->Printf("error: %s\n", error_cstr); + } } return 0; } -size_t Disassembler::ParseInstructions(const ExecutionContext *exe_ctx, - const Address &start, +size_t Disassembler::ParseInstructions(Target &target, const Address &start, uint32_t num_instructions, bool prefer_file_cache) { m_instruction_list.Clear(); - if (exe_ctx == nullptr || num_instructions == 0 || !start.IsValid()) + if (num_instructions == 0 || !start.IsValid()) return 0; - Target *target = exe_ctx->GetTargetPtr(); // Calculate the max buffer size we will need in order to disassemble const addr_t byte_size = num_instructions * m_arch.GetMaximumOpcodeByteSize(); - if (target == nullptr || byte_size == 0) + if (byte_size == 0) return 0; DataBufferHeap *heap_buffer = new DataBufferHeap(byte_size, '\0'); @@ -1243,8 +1235,8 @@ size_t Disassembler::ParseInstructions(const ExecutionContext *exe_ctx, Status error; lldb::addr_t load_addr = LLDB_INVALID_ADDRESS; const size_t bytes_read = - target->ReadMemory(start, prefer_file_cache, heap_buffer->GetBytes(), - byte_size, error, &load_addr); + 
target.ReadMemory(start, prefer_file_cache, heap_buffer->GetBytes(), + byte_size, error, &load_addr); const bool data_from_file = load_addr == LLDB_INVALID_ADDRESS; diff --git a/lldb/source/Core/IOHandler.cpp b/lldb/source/Core/IOHandler.cpp index 939c87c1b15ec1..933da6bfbf75f2 100644 --- a/lldb/source/Core/IOHandler.cpp +++ b/lldb/source/Core/IOHandler.cpp @@ -125,6 +125,8 @@ void IOHandlerStack::PrintAsync(Stream *stream, const char *s, size_t len) { std::lock_guard guard(m_mutex); if (m_top) m_top->PrintAsync(stream, s, len); + else + stream->Write(s, len); } } diff --git a/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp b/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp index f426ac63e4b535..e3d1aa3b11dd03 100644 --- a/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp +++ b/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp @@ -120,9 +120,7 @@ lldb::addr_t ArchitectureMips::GetBreakableLoadAddress(lldb::addr_t addr, if (current_offset == 0) return addr; - ExecutionContext ctx; - target.CalculateExecutionContext(ctx); - auto insn = GetInstructionAtAddress(ctx, current_offset, addr); + auto insn = GetInstructionAtAddress(target, current_offset, addr); if (nullptr == insn || !insn->HasDelaySlot()) return addr; @@ -138,8 +136,7 @@ lldb::addr_t ArchitectureMips::GetBreakableLoadAddress(lldb::addr_t addr, } Instruction *ArchitectureMips::GetInstructionAtAddress( - const ExecutionContext &exe_ctx, const Address &resolved_addr, - addr_t symbol_offset) const { + Target &target, const Address &resolved_addr, addr_t symbol_offset) const { auto loop_count = symbol_offset / 2; @@ -174,7 +171,7 @@ Instruction *ArchitectureMips::GetInstructionAtAddress( AddressRange range(addr, i * 2); uint32_t insn_size = 0; - disasm_sp->ParseInstructions(&exe_ctx, range, nullptr, prefer_file_cache); + disasm_sp->ParseInstructions(target, range, nullptr, prefer_file_cache); uint32_t num_insns = disasm_sp->GetInstructionList().GetSize(); if (num_insns) { 
diff --git a/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.h b/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.h index 40bcc23fd8cd4d..71ee60184b6955 100644 --- a/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.h +++ b/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.h @@ -35,11 +35,10 @@ class ArchitectureMips : public Architecture { AddressClass addr_class) const override; private: - Instruction *GetInstructionAtAddress(const ExecutionContext &exe_ctx, + Instruction *GetInstructionAtAddress(Target &target, const Address &resolved_addr, lldb::addr_t symbol_offset) const; - static std::unique_ptr Create(const ArchSpec &arch); ArchitectureMips(const ArchSpec &arch) : m_arch(arch) {} diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp index 5440367fd71770..1ed3e693d8d21d 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp @@ -50,7 +50,7 @@ #include #endif -#if LLVM_ENABLE_ZLIB +#if defined(HAVE_LIBZ) #include #endif @@ -582,7 +582,7 @@ bool GDBRemoteCommunication::DecompressPacket() { } #endif -#if LLVM_ENABLE_ZLIB +#if defined(HAVE_LIBZ) if (decompressed_bytes == 0 && decompressed_bufsize != ULONG_MAX && decompressed_buffer != nullptr && m_compression_type == CompressionType::ZlibDeflate) { diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index 67e5d59d199ecb..6021c2664b0678 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -1053,7 +1053,7 @@ void GDBRemoteCommunicationClient::MaybeEnableCompression( } #endif -#if LLVM_ENABLE_ZLIB +#if defined(HAVE_LIBZ) if (avail_type == CompressionType::None) { for (auto compression : 
supported_compressions) { if (compression == "zlib-deflate") { diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 156f6f7f4fc95d..72907a95f3ab3a 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -3123,7 +3123,7 @@ Status ProcessGDBRemote::EnableBreakpointSite(BreakpointSite *bp_site) { if (m_gdb_comm.SupportsGDBStoppointPacket(eBreakpointSoftware)) { if (error_no != UINT8_MAX) error.SetErrorStringWithFormat( - "error: %d sending the breakpoint request", errno); + "error: %d sending the breakpoint request", error_no); else error.SetErrorString("error sending the breakpoint request"); return error; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index c89ccb5bf96056..c27b5c4c349541 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -324,10 +324,8 @@ void SymbolFileDWARF::GetTypes(const DWARFDIE &die, dw_offset_t min_die_offset, if (add_type) { const bool assert_not_being_parsed = true; Type *type = ResolveTypeUID(die, assert_not_being_parsed); - if (type) { - if (type_set.find(type) == type_set.end()) - type_set.insert(type); - } + if (type) + type_set.insert(type); } } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h index a3928c8c3dd490..479235c0d86f9a 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h @@ -12,11 +12,11 @@ #include #include #include -#include #include #include #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Support/Threading.h" #include "lldb/Core/UniqueCStringMap.h" @@ -439,7 +439,7 @@ class SymbolFileDWARF : public 
lldb_private::SymbolFile, bool FixupAddress(lldb_private::Address &addr); - typedef std::set TypeSet; + typedef llvm::SetVector TypeSet; void GetTypes(const DWARFDIE &die, dw_offset_t min_die_offset, dw_offset_t max_die_offset, uint32_t type_mask, diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 845318471fa15f..ffcab238d09de1 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -84,12 +84,14 @@ using llvm::StringSwitch; LLDB_PLUGIN_DEFINE(TypeSystemClang) namespace { -#ifdef LLDB_CONFIGURATION_DEBUG static void VerifyDecl(clang::Decl *decl) { assert(decl && "VerifyDecl called with nullptr?"); +#ifndef NDEBUG + // We don't care about the actual access value here but only want to trigger + // that Clang calls its internal Decl::AccessDeclContextSanity check. decl->getAccess(); -} #endif +} static inline bool TypeSystemClangSupportsLanguage(lldb::LanguageType language) { @@ -1415,9 +1417,7 @@ ClassTemplateDecl *TypeSystemClang::CreateClassTemplateDecl( decl_ctx->addDecl(class_template_decl); -#ifdef LLDB_CONFIGURATION_DEBUG VerifyDecl(class_template_decl); -#endif } return class_template_decl; @@ -1687,9 +1687,7 @@ NamespaceDecl *TypeSystemClang::GetUniqueNamespaceDeclaration( } } } -#ifdef LLDB_CONFIGURATION_DEBUG VerifyDecl(namespace_decl); -#endif return namespace_decl; } @@ -1892,9 +1890,7 @@ FunctionDecl *TypeSystemClang::CreateFunctionDeclaration( if (func_decl) decl_ctx->addDecl(func_decl); -#ifdef LLDB_CONFIGURATION_DEBUG VerifyDecl(func_decl); -#endif return func_decl; } @@ -6937,9 +6933,7 @@ clang::FieldDecl *TypeSystemClang::AddFieldToRecordType( record_decl->addDecl(field); -#ifdef LLDB_CONFIGURATION_DEBUG VerifyDecl(field); -#endif } } else { clang::ObjCInterfaceDecl *class_interface_decl = @@ -6962,9 +6956,7 @@ clang::FieldDecl *TypeSystemClang::AddFieldToRecordType( if (field) { 
class_interface_decl->addDecl(field); -#ifdef LLDB_CONFIGURATION_DEBUG VerifyDecl(field); -#endif } } } @@ -7128,9 +7120,7 @@ clang::VarDecl *TypeSystemClang::AddVariableToRecordType( TypeSystemClang::ConvertAccessTypeToAccessSpecifier(access)); record_decl->addDecl(var_decl); -#ifdef LLDB_CONFIGURATION_DEBUG VerifyDecl(var_decl); -#endif return var_decl; } @@ -7310,9 +7300,7 @@ clang::CXXMethodDecl *TypeSystemClang::AddMethodToCXXRecordType( } } -#ifdef LLDB_CONFIGURATION_DEBUG VerifyDecl(cxx_method_decl); -#endif return cxx_method_decl; } @@ -7704,9 +7692,7 @@ clang::ObjCMethodDecl *TypeSystemClang::AddMethodToObjCObjectType( class_interface_decl->addDecl(objc_method_decl); -#ifdef LLDB_CONFIGURATION_DEBUG VerifyDecl(objc_method_decl); -#endif return objc_method_decl; } @@ -7904,10 +7890,7 @@ clang::EnumConstantDecl *TypeSystemClang::AddEnumerationValueToEnumerationType( enutype->getDecl()->addDecl(enumerator_decl); -#ifdef LLDB_CONFIGURATION_DEBUG VerifyDecl(enumerator_decl); -#endif - return enumerator_decl; } diff --git a/lldb/source/Target/CMakeLists.txt b/lldb/source/Target/CMakeLists.txt index 893065442e8064..2d9274ec52cacb 100644 --- a/lldb/source/Target/CMakeLists.txt +++ b/lldb/source/Target/CMakeLists.txt @@ -79,6 +79,7 @@ add_lldb_library(lldbTarget LINK_COMPONENTS Support + MC ) add_dependencies(lldbTarget diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index 00f8b5ae276e54..60d5617053ec9b 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -596,8 +596,12 @@ std::string Thread::GetStopDescription() { std::string Thread::GetStopDescriptionRaw() { StopInfoSP stop_info_sp = GetStopInfo(); std::string raw_stop_description; - if (stop_info_sp && stop_info_sp->IsValid()) + if (stop_info_sp && stop_info_sp->IsValid()) { raw_stop_description = stop_info_sp->GetDescription(); + assert((!raw_stop_description.empty() || + stop_info_sp->GetStopReason() == eStopReasonNone) && + "StopInfo returned an empty 
description."); + } return raw_stop_description; } diff --git a/lldb/source/Utility/Broadcaster.cpp b/lldb/source/Utility/Broadcaster.cpp index 90f91b4f89cfc8..342548c0b0e616 100644 --- a/lldb/source/Utility/Broadcaster.cpp +++ b/lldb/source/Utility/Broadcaster.cpp @@ -373,8 +373,8 @@ bool BroadcasterManager::UnregisterListenerForEvents( if (event_bits_to_remove != iter_event_bits) { uint32_t new_event_bits = iter_event_bits & ~event_bits_to_remove; - to_be_readded.push_back( - BroadcastEventSpec(event_spec.GetBroadcasterClass(), new_event_bits)); + to_be_readded.emplace_back(event_spec.GetBroadcasterClass(), + new_event_bits); } m_event_map.erase(iter); } diff --git a/lldb/test/Shell/Commands/command-thread-select.test b/lldb/test/Shell/Commands/command-thread-select.test new file mode 100644 index 00000000000000..3b48452eea8264 --- /dev/null +++ b/lldb/test/Shell/Commands/command-thread-select.test @@ -0,0 +1,17 @@ +# RUN: %clang_host -g %S/Inputs/main.c -o %t +# RUN: %lldb %t -s %s -o exit | FileCheck %s + +b main +# CHECK-LABEL: b main +# CHECK: Breakpoint 1: where = {{.*}}`main + +run +# CHECK-LABEL: run +# CHECK: Process {{.*}} stopped +# CHECK: stop reason = breakpoint 1 +# CHECK: frame #0: {{.*}}`main at main.c + +thread select 1 +# CHECK-LABEL: thread select 1 +# CHECK: stop reason = breakpoint 1 +# CHECK: frame #0: {{.*}}`main at main.c diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 781098d389167b..02c4bddf21af89 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -347,13 +347,7 @@ option(LLVM_ENABLE_LIBPFM "Use libpfm for performance counters if available." ON option(LLVM_ENABLE_THREADS "Use threads if available." ON) -if(CMAKE_SYSTEM_NAME STREQUAL Windows) - set(zlib_DEFAULT "OFF") -else() - set(zlib_DEFAULT "ON") -endif() - -set(LLVM_ENABLE_ZLIB "${zlib_DEFAULT}" CACHE STRING "Use zlib for compression/decompression if available. 
Can be ON, OFF, or FORCE_ON") +option(LLVM_ENABLE_ZLIB "Use zlib for compression/decompression if available." ON) set(LLVM_Z3_INSTALL_DIR "" CACHE STRING "Install directory of the Z3 solver.") diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index fc66dbfcbe7a70..f758366bc79d4a 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -56,6 +56,7 @@ check_include_file(sys/types.h HAVE_SYS_TYPES_H) check_include_file(termios.h HAVE_TERMIOS_H) check_include_file(unistd.h HAVE_UNISTD_H) check_include_file(valgrind/valgrind.h HAVE_VALGRIND_VALGRIND_H) +check_include_file(zlib.h HAVE_ZLIB_H) check_include_file(fenv.h HAVE_FENV_H) check_symbol_exists(FE_ALL_EXCEPT "fenv.h" HAVE_DECL_FE_ALL_EXCEPT) check_symbol_exists(FE_INEXACT "fenv.h" HAVE_DECL_FE_INEXACT) @@ -117,6 +118,19 @@ endif() # Don't look for these libraries if we're using MSan, since uninstrumented third # party code may call MSan interceptors like strlen, leading to false positives. if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*") + set(HAVE_LIBZ 0) + if(LLVM_ENABLE_ZLIB) + foreach(library z zlib_static zlib) + string(TOUPPER ${library} library_suffix) + check_library_exists(${library} compress2 "" HAVE_LIBZ_${library_suffix}) + if(HAVE_LIBZ_${library_suffix}) + set(HAVE_LIBZ 1) + set(ZLIB_LIBRARIES "${library}") + break() + endif() + endforeach() + endif() + # Don't look for these libraries on Windows. if (NOT PURE_WINDOWS) # Skip libedit if using ASan as it contains memory leaks. @@ -501,21 +515,10 @@ else( LLVM_ENABLE_THREADS ) message(STATUS "Threads disabled.") endif() -if(LLVM_ENABLE_ZLIB) - if(LLVM_ENABLE_ZLIB STREQUAL FORCE_ON) - find_package(ZLIB REQUIRED) - else() - find_package(ZLIB) - endif() - - if(ZLIB_FOUND) - set(LLVM_ENABLE_ZLIB "YES" CACHE STRING - "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON" - FORCE) - else() - set(LLVM_ENABLE_ZLIB "NO" CACHE STRING - "Use zlib for compression/decompression if available. 
Can be ON, OFF, or FORCE_ON" - FORCE) +if (LLVM_ENABLE_ZLIB ) + # Check if zlib is available in the system. + if ( NOT HAVE_ZLIB_H OR NOT HAVE_LIBZ ) + set(LLVM_ENABLE_ZLIB 0) endif() endif() diff --git a/llvm/include/llvm/Analysis/LoopNestAnalysis.h b/llvm/include/llvm/Analysis/LoopNestAnalysis.h new file mode 100644 index 00000000000000..5b2ec3a265364d --- /dev/null +++ b/llvm/include/llvm/Analysis/LoopNestAnalysis.h @@ -0,0 +1,162 @@ +//===- llvm/Analysis/LoopNestAnalysis.h -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the interface for the loop nest analysis. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_LOOPNESTANALYSIS_H +#define LLVM_ANALYSIS_LOOPNESTANALYSIS_H + +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" + +namespace llvm { + +using LoopVectorTy = SmallVector; + +/// This class represents a loop nest and can be used to query its properties. +class LoopNest { +public: + /// Construct a loop nest rooted by loop \p Root. + LoopNest(Loop &Root, ScalarEvolution &SE); + + LoopNest() = delete; + LoopNest &operator=(const LoopNest &) = delete; + + /// Construct a LoopNest object. + static std::unique_ptr getLoopNest(Loop &Root, ScalarEvolution &SE); + + /// Return true if the given loops \p OuterLoop and \p InnerLoop are + /// perfectly nested with respect to each other, and false otherwise. + /// Example: + /// \code + /// for(i) + /// for(j) + /// for(k) + /// \endcode + /// arePerfectlyNested(loop_i, loop_j, SE) would return true. 
+ /// arePerfectlyNested(loop_j, loop_k, SE) would return true. + /// arePerfectlyNested(loop_i, loop_k, SE) would return false. + static bool arePerfectlyNested(const Loop &OuterLoop, const Loop &InnerLoop, + ScalarEvolution &SE); + + /// Return the maximum nesting depth of the loop nest rooted by loop \p Root. + /// For example given the loop nest: + /// \code + /// for(i) // loop at level 1 and Root of the nest + /// for(j) // loop at level 2 + /// + /// for(k) // loop at level 3 + /// \endcode + /// getMaxPerfectDepth(Loop_i) would return 2. + static unsigned getMaxPerfectDepth(const Loop &Root, ScalarEvolution &SE); + + /// Return the outermost loop in the loop nest. + Loop &getOutermostLoop() const { return *Loops.front(); } + + /// Return the innermost loop in the loop nest if the nest has only one + /// innermost loop, and a nullptr otherwise. + /// Note: the innermost loop returned is not necessarily perfectly nested. + Loop *getInnermostLoop() const { + if (Loops.size() == 1) + return Loops.back(); + + // The loops in the 'Loops' vector have been collected in breadth first + // order, therefore if the last 2 loops in it have the same nesting depth + // there isn't a unique innermost loop in the nest. + Loop *LastLoop = Loops.back(); + auto SecondLastLoopIter = ++Loops.rbegin(); + return (LastLoop->getLoopDepth() == (*SecondLastLoopIter)->getLoopDepth()) + ? nullptr + : LastLoop; + } + + /// Return the loop at the given \p Index. + Loop *getLoop(unsigned Index) const { + assert(Index < Loops.size() && "Index is out of bounds"); + return Loops[Index]; + } + + /// Return the number of loops in the nest. + size_t getNumLoops() const { return Loops.size(); } + + /// Get the loops in the nest. + ArrayRef getLoops() const { return Loops; } + + /// Retrieve a vector of perfect loop nests contained in the current loop + /// nest. For example, given the following nest containing 4 loops, this + /// member function would return {{L1,L2},{L3,L4}}. 
+ /// \code + /// for(i) // L1 + /// for(j) // L2 + /// + /// for(k) // L3 + /// for(l) // L4 + /// \endcode + SmallVector getPerfectLoops(ScalarEvolution &SE) const; + + /// Return the loop nest depth (i.e. the loop depth of the 'deepest' loop) + /// For example given the loop nest: + /// \code + /// for(i) // loop at level 1 and Root of the nest + /// for(j1) // loop at level 2 + /// for(k) // loop at level 3 + /// for(j2) // loop at level 2 + /// \endcode + /// getNestDepth() would return 3. + unsigned getNestDepth() const { + int NestDepth = + Loops.back()->getLoopDepth() - Loops.front()->getLoopDepth() + 1; + assert(NestDepth > 0 && "Expecting NestDepth to be at least 1"); + return NestDepth; + } + + /// Return the maximum perfect nesting depth. + unsigned getMaxPerfectDepth() const { return MaxPerfectDepth; } + + /// Return true if all loops in the loop nest are in simplify form. + bool areAllLoopsSimplifyForm() const { + return llvm::all_of(Loops, + [](const Loop *L) { return L->isLoopSimplifyForm(); }); + } + +protected: + const unsigned MaxPerfectDepth; // maximum perfect nesting depth level. + LoopVectorTy Loops; // the loops in the nest (in breadth first order). +}; + +raw_ostream &operator<<(raw_ostream &, const LoopNest &); + +/// This analysis provides information for a loop nest. The analysis runs on +/// demand and can be initiated via AM.getResult. +class LoopNestAnalysis : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; + +public: + using Result = LoopNest; + Result run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR); +}; + +/// Printer pass for the \c LoopNest results. 
+class LoopNestPrinterPass : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit LoopNestPrinterPass(raw_ostream &OS) : OS(OS) {} + + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + +} // namespace llvm + +#endif // LLVM_ANALYSIS_LOOPNESTANALYSIS_H diff --git a/llvm/include/llvm/Analysis/ProfileSummaryInfo.h b/llvm/include/llvm/Analysis/ProfileSummaryInfo.h index 6693e40ccf223e..f90dcf604e9b09 100644 --- a/llvm/include/llvm/Analysis/ProfileSummaryInfo.h +++ b/llvm/include/llvm/Analysis/ProfileSummaryInfo.h @@ -120,6 +120,11 @@ class ProfileSummaryInfo { bool isFunctionHotInCallGraphNthPercentile(int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI); + /// Returns true if \p F contains cold code with regard to a given cold + /// percentile cutoff value. + bool isFunctionColdInCallGraphNthPercentile(int PercentileCutoff, + const Function *F, + BlockFrequencyInfo &BFI); /// Returns true if count \p C is considered hot. bool isHotCount(uint64_t C); /// Returns true if count \p C is considered cold. @@ -127,6 +132,9 @@ class ProfileSummaryInfo { /// Returns true if count \p C is considered hot with regard to a given /// hot percentile cutoff value. bool isHotCountNthPercentile(int PercentileCutoff, uint64_t C); + /// Returns true if count \p C is considered cold with regard to a given + /// cold percentile cutoff value. + bool isColdCountNthPercentile(int PercentileCutoff, uint64_t C); /// Returns true if BasicBlock \p BB is considered hot. bool isHotBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI); /// Returns true if BasicBlock \p BB is considered cold. @@ -135,6 +143,10 @@ class ProfileSummaryInfo { /// hot percentile cutoff value. bool isHotBlockNthPercentile(int PercentileCutoff, const BasicBlock *BB, BlockFrequencyInfo *BFI); + /// Returns true if BasicBlock \p BB is considered cold with regard to a given + /// cold percentile cutoff value. 
+ bool isColdBlockNthPercentile(int PercentileCutoff, + const BasicBlock *BB, BlockFrequencyInfo *BFI); /// Returns true if CallSite \p CS is considered hot. bool isHotCallSite(const CallSite &CS, BlockFrequencyInfo *BFI); /// Returns true if Callsite \p CS is considered cold. @@ -153,6 +165,17 @@ class ProfileSummaryInfo { uint64_t getColdCountThreshold() { return ColdCountThreshold ? ColdCountThreshold.getValue() : 0; } + + private: + template + bool isFunctionHotOrColdInCallGraphNthPercentile(int PercentileCutoff, + const Function *F, + BlockFrequencyInfo &BFI); + template + bool isHotOrColdCountNthPercentile(int PercentileCutoff, uint64_t C); + template + bool isHotOrColdBlockNthPercentile(int PercentileCutoff, const BasicBlock *BB, + BlockFrequencyInfo *BFI); }; /// An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo. diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 8a1e720d77f6bb..2968a5f37a4617 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -494,6 +494,8 @@ class TargetTransformInfo { bool UpperBound; /// Allow peeling off loop iterations. bool AllowPeeling; + /// Allow peeling off loop iterations for loop nests. + bool AllowLoopNestsPeeling; /// Allow unrolling of all the iterations of the runtime loop remainder. bool UnrollRemainder; /// Allow unroll and jam. Used to enable unroll and jam for the target. diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 9583e2b718e5ba..0552420c3c3362 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2754,6 +2754,13 @@ class TargetLoweringBase { /// The default implementation just freezes the set of reserved registers. 
virtual void finalizeLowering(MachineFunction &MF) const; + //===----------------------------------------------------------------------===// + // GlobalISel Hooks + //===----------------------------------------------------------------------===// + /// Check whether or not \p MI needs to be moved close to its uses. + virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const; + + private: const TargetMachine &TM; diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index db170ae5d62d27..1a38bc15ab9d66 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -109,6 +109,9 @@ /* Define to 1 if you have the `pthread_setname_np' function. */ #cmakedefine HAVE_PTHREAD_SETNAME_NP ${HAVE_PTHREAD_SETNAME_NP} +/* Define to 1 if you have the `z' library (-lz). */ +#cmakedefine HAVE_LIBZ ${HAVE_LIBZ} + /* Define to 1 if you have the header file. */ #cmakedefine HAVE_LINK_H ${HAVE_LINK_H} @@ -223,6 +226,9 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_VALGRIND_VALGRIND_H ${HAVE_VALGRIND_VALGRIND_H} +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_ZLIB_H ${HAVE_ZLIB_H} + /* Have host's _alloca */ #cmakedefine HAVE__ALLOCA ${HAVE__ALLOCA} diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h index 7880bcdf688132..e666c82bca0804 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h @@ -39,7 +39,10 @@ class DWARFDebugMacro { }; }; - using MacroList = SmallVector; + struct MacroList { + SmallVector Macros; + uint64_t Offset; + }; /// A list of all the macro entries in the debug_macinfo section. 
std::vector MacroLists; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index 84f4ae7599fd0a..20e5b95a827aea 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -91,6 +91,7 @@ __OMP_DIRECTIVE_EXT(parallel_master_taskloop, "parallel master taskloop") __OMP_DIRECTIVE_EXT(master_taskloop_simd, "master taskloop simd") __OMP_DIRECTIVE_EXT(parallel_master_taskloop_simd, "parallel master taskloop simd") +__OMP_DIRECTIVE(depobj) // Has to be the last because Clang implicitly expects it to be. __OMP_DIRECTIVE(unknown) @@ -130,7 +131,7 @@ __OMP_TYPE(Int32Ptr) #define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) #endif -#define __OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \ +#define __OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \ OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) __OMP_ARRAY_TYPE(KmpCriticalName, Int32, 8) diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index 56874ecf9d1e67..8259df101178da 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -833,8 +833,7 @@ struct TypeTestResolution { Single, ///< Single element (last example in "Short Inline Bit Vectors") AllOnes, ///< All-ones bit vector ("Eliminating Bit Vector Checks for /// All-Ones Bit Vectors") - Unknown, ///< Unknown (analysis not performed, don't lower) - } TheKind = Unknown; + } TheKind = Unsat; /// Range of size-1 expressed as a bit width. For example, if the size is in /// range [1,256], this number will be 8. This helps generate the most compact @@ -1028,7 +1027,7 @@ class ModuleSummaryIndex { // in the way some record are interpreted, like flags for instance. // Note that incrementing this may require changes in both BitcodeReader.cpp // and BitcodeWriter.cpp. 
- static constexpr uint64_t BitcodeSummaryVersion = 9; + static constexpr uint64_t BitcodeSummaryVersion = 8; // Regular LTO module name for ASM writer static constexpr const char *getRegularLTOModuleName() { diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h index e51ec67b4c914f..7dcb455274f891 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -17,7 +17,6 @@ namespace yaml { template <> struct ScalarEnumerationTraits { static void enumeration(IO &io, TypeTestResolution::Kind &value) { - io.enumCase(value, "Unknown", TypeTestResolution::Unknown); io.enumCase(value, "Unsat", TypeTestResolution::Unsat); io.enumCase(value, "ByteArray", TypeTestResolution::ByteArray); io.enumCase(value, "Inline", TypeTestResolution::Inline); diff --git a/llvm/include/llvm/MC/LaneBitmask.h b/llvm/include/llvm/MC/LaneBitmask.h index d5f69287a265b0..b070bea3201cfe 100644 --- a/llvm/include/llvm/MC/LaneBitmask.h +++ b/llvm/include/llvm/MC/LaneBitmask.h @@ -38,9 +38,9 @@ namespace llvm { struct LaneBitmask { // When changing the underlying type, change the format string as well. 
- using Type = unsigned; + using Type = uint64_t; enum : unsigned { BitWidth = 8*sizeof(Type) }; - constexpr static const char *const FormatStr = "%08X"; + constexpr static const char *const FormatStr = "%016lX"; constexpr LaneBitmask() = default; explicit constexpr LaneBitmask(Type V) : Mask(V) {} @@ -76,7 +76,7 @@ namespace llvm { return countPopulation(Mask); } unsigned getHighestLane() const { - return Log2_32(Mask); + return Log2_64(Mask); } static constexpr LaneBitmask getNone() { return LaneBitmask(0); } diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h index caa392a41b2be6..b57439f02ca550 100644 --- a/llvm/include/llvm/MC/MCAssembler.h +++ b/llvm/include/llvm/MC/MCAssembler.h @@ -195,6 +195,7 @@ class MCAssembler { bool relaxFragment(MCAsmLayout &Layout, MCFragment &F); bool relaxInstruction(MCAsmLayout &Layout, MCRelaxableFragment &IF); bool relaxLEB(MCAsmLayout &Layout, MCLEBFragment &IF); + bool relaxBoundaryAlign(MCAsmLayout &Layout, MCBoundaryAlignFragment &BF); bool relaxDwarfLineAddr(MCAsmLayout &Layout, MCDwarfLineAddrFragment &DF); bool relaxDwarfCallFrameFragment(MCAsmLayout &Layout, MCDwarfCallFrameFragment &DF); diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h index 610e924f7846c3..e052098611a999 100644 --- a/llvm/include/llvm/MC/MCFragment.h +++ b/llvm/include/llvm/MC/MCFragment.h @@ -528,6 +528,9 @@ class MCBoundaryAlignFragment : public MCFragment { bool Fused : 1; /// Flag to indicate whether NOPs should be emitted. bool EmitNops : 1; + /// The size of the fragment. The size is lazily set during relaxation, and + /// is not meaningful before that. 
+ uint64_t Size = 0; public: MCBoundaryAlignFragment(Align AlignBoundary = Align(1), bool Fused = false, @@ -535,6 +538,9 @@ class MCBoundaryAlignFragment : public MCFragment { : MCFragment(FT_BoundaryAlign, false, Sec), AlignBoundary(AlignBoundary), Fused(Fused), EmitNops(EmitNops) {} + uint64_t getSize() const { return Size; } + void setSize(uint64_t Value) { Size = Value; } + Align getAlignment() const { return AlignBoundary; } void setAlignment(Align Value) { AlignBoundary = Value; } diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMappingReader.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMappingReader.h index 99cc52f54ab90e..97f4c32eb035ac 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMappingReader.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMappingReader.h @@ -185,11 +185,17 @@ class BinaryCoverageReader : public CoverageMappingReader { std::vector Expressions; std::vector MappingRegions; + // Used to tie the lifetimes of coverage function records to the lifetime of + // this BinaryCoverageReader instance. Needed to support the format change in + // D69471, which can split up function records into multiple sections on ELF. + std::string FuncRecords; + // Used to tie the lifetimes of decompressed strings to the lifetime of this // BinaryCoverageReader instance. 
DecompressedData Decompressed; - BinaryCoverageReader() = default; + BinaryCoverageReader(std::string &&FuncRecords) + : FuncRecords(std::move(FuncRecords)) {} public: BinaryCoverageReader(const BinaryCoverageReader &) = delete; @@ -200,7 +206,7 @@ class BinaryCoverageReader : public CoverageMappingReader { SmallVectorImpl> &ObjectFileBuffers); static Expected> - createCoverageReaderFromBuffer(StringRef Coverage, StringRef FuncRecords, + createCoverageReaderFromBuffer(StringRef Coverage, std::string &&FuncRecords, InstrProfSymtab &&ProfileNames, uint8_t BytesInAddress, support::endianness Endian); diff --git a/llvm/include/llvm/Support/Base64.h b/llvm/include/llvm/Support/Base64.h index 3f6616633e5fbf..62064a35aa3448 100644 --- a/llvm/include/llvm/Support/Base64.h +++ b/llvm/include/llvm/Support/Base64.h @@ -26,20 +26,23 @@ template std::string encodeBase64(InputBytes const &Bytes) { size_t i = 0, j = 0; for (size_t n = Bytes.size() / 3 * 3; i < n; i += 3, j += 4) { - uint32_t x = (Bytes[i] << 16) | (Bytes[i + 1] << 8) | Bytes[i + 2]; + uint32_t x = ((unsigned char)Bytes[i] << 16) | + ((unsigned char)Bytes[i + 1] << 8) | + (unsigned char)Bytes[i + 2]; Buffer[j + 0] = Table[(x >> 18) & 63]; Buffer[j + 1] = Table[(x >> 12) & 63]; Buffer[j + 2] = Table[(x >> 6) & 63]; Buffer[j + 3] = Table[x & 63]; } if (i + 1 == Bytes.size()) { - uint32_t x = (Bytes[i] << 16); + uint32_t x = ((unsigned char)Bytes[i] << 16); Buffer[j + 0] = Table[(x >> 18) & 63]; Buffer[j + 1] = Table[(x >> 12) & 63]; Buffer[j + 2] = '='; Buffer[j + 3] = '='; } else if (i + 2 == Bytes.size()) { - uint32_t x = (Bytes[i] << 16) | (Bytes[i + 1] << 8); + uint32_t x = + ((unsigned char)Bytes[i] << 16) | ((unsigned char)Bytes[i + 1] << 8); Buffer[j + 0] = Table[(x >> 18) & 63]; Buffer[j + 1] = Table[(x >> 12) & 63]; Buffer[j + 2] = Table[(x >> 6) & 63]; diff --git a/llvm/include/llvm/Transforms/Utils/KnowledgeRetention.h b/llvm/include/llvm/Transforms/Utils/KnowledgeRetention.h index 
27d83373e07456..c3baf8a43c0d80 100644 --- a/llvm/include/llvm/Transforms/Utils/KnowledgeRetention.h +++ b/llvm/include/llvm/Transforms/Utils/KnowledgeRetention.h @@ -19,6 +19,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/PassManager.h" +#include "llvm/ADT/DenseMap.h" namespace llvm { @@ -58,9 +59,41 @@ inline bool hasAttributeInAssume(CallInst &AssumeCI, Value *IsOn, AssumeCI, IsOn, Attribute::getNameFromAttrKind(Kind), ArgVal, AQR); } -/// TODO: Add an function to create/fill a map from the bundle when users intend -/// to make many different queries on the same bundles. to be used for example -/// in the Attributor. +template<> struct DenseMapInfo { + static constexpr auto MaxValue = std::numeric_limits< + std::underlying_type::type>::max(); + static Attribute::AttrKind getEmptyKey() { + return static_cast(MaxValue); + } + static Attribute::AttrKind getTombstoneKey() { + return static_cast(MaxValue - 1); + } + static unsigned getHashValue(Attribute::AttrKind AK) { + return hash_combine(AK); + } + static bool isEqual(Attribute::AttrKind LHS, Attribute::AttrKind RHS) { + return LHS == RHS; + } +}; + +/// The map Key contains the Value on which the attribute is valid and +/// the Attribute that is valid for that value. +/// If the Attribute is not on any value, the Value is nullptr. +using RetainedKnowledgeKey = std::pair; + +struct MinMax { + unsigned Min; + unsigned Max; +}; + +using RetainedKnowledgeMap = DenseMap; + +/// Insert into the map all the information contained in the operand bundles of +/// the llvm.assume. This should be used instead of hasAttributeInAssume when +/// many queries are going to be made on the same llvm.assume. +/// String attributes are not inserted in the map. +/// If the IR changes the map will be outdated. 
+void fillMapFromAssume(CallInst &AssumeCI, RetainedKnowledgeMap &Result); //===----------------------------------------------------------------------===// // Utilities for testing diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 3de4318cc7b3b9..15a3be5487ebf8 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -24,6 +24,7 @@ #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" @@ -31,6 +32,7 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Casting.h" +#include "llvm/Transforms/Utils/ValueMapper.h" namespace llvm { @@ -426,6 +428,12 @@ void appendReversedLoopsToWorklist(RangeT &&, /// already reversed loops in LI. /// FIXME: Consider changing the order in LoopInfo. void appendLoopsToWorklist(LoopInfo &, SmallPriorityWorklist &); + +/// Recursively clone the specified loop and all of its children, +/// mapping the blocks with the specified map. 
+Loop *cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, + LoopInfo *LI, LPPassManager *LPM); + } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_LOOPUTILS_H diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index 34140e18677d26..969049f9078217 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -53,6 +53,7 @@ add_llvm_component_library(LLVMAnalysis LoopAccessAnalysis.cpp LoopAnalysisManager.cpp LoopCacheAnalysis.cpp + LoopNestAnalysis.cpp LoopUnrollAnalyzer.cpp LoopInfo.cpp LoopPass.cpp diff --git a/llvm/lib/Analysis/LoopNestAnalysis.cpp b/llvm/lib/Analysis/LoopNestAnalysis.cpp new file mode 100644 index 00000000000000..61e53de93151aa --- /dev/null +++ b/llvm/lib/Analysis/LoopNestAnalysis.cpp @@ -0,0 +1,296 @@ +//===- LoopNestAnalysis.cpp - Loop Nest Analysis --------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// The implementation for the loop nest analysis. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/LoopNestAnalysis.h" +#include "llvm/ADT/BreadthFirstIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/ValueTracking.h" + +using namespace llvm; + +#define DEBUG_TYPE "loopnest" +#ifndef NDEBUG +static const char *VerboseDebug = DEBUG_TYPE "-verbose"; +#endif + +/// Determine whether the loops structure violates basic requirements for +/// perfect nesting: +/// - the inner loop should be the outer loop's only child +/// - the outer loop header should 'flow' into the inner loop preheader +/// or jump around the inner loop to the outer loop latch +/// - if the inner loop latch exits the inner loop, it should 'flow' into +/// the outer loop latch. +/// Returns true if the loop structure satisfies the basic requirements and +/// false otherwise. +static bool checkLoopsStructure(const Loop &OuterLoop, const Loop &InnerLoop, + ScalarEvolution &SE); + +//===----------------------------------------------------------------------===// +// LoopNest implementation +// + +LoopNest::LoopNest(Loop &Root, ScalarEvolution &SE) + : MaxPerfectDepth(getMaxPerfectDepth(Root, SE)) { + for (Loop *L : breadth_first(&Root)) + Loops.push_back(L); +} + +std::unique_ptr LoopNest::getLoopNest(Loop &Root, + ScalarEvolution &SE) { + return std::make_unique(Root, SE); +} + +bool LoopNest::arePerfectlyNested(const Loop &OuterLoop, const Loop &InnerLoop, + ScalarEvolution &SE) { + assert(!OuterLoop.getSubLoops().empty() && "Outer loop should have subloops"); + assert(InnerLoop.getParentLoop() && "Inner loop should have a parent"); + LLVM_DEBUG(dbgs() << "Checking whether loop '" << OuterLoop.getName() + << "' and '" << InnerLoop.getName() + << "' are perfectly nested.\n"); + + // Determine whether the loops structure satisfies the following requirements: + // - the inner loop should be the outer loop's only child + // - the 
outer loop header should 'flow' into the inner loop preheader + // or jump around the inner loop to the outer loop latch + // - if the inner loop latch exits the inner loop, it should 'flow' into + // the outer loop latch. + if (!checkLoopsStructure(OuterLoop, InnerLoop, SE)) { + LLVM_DEBUG(dbgs() << "Not perfectly nested: invalid loop structure.\n"); + return false; + } + + // Bail out if we cannot retrieve the outer loop bounds. + auto OuterLoopLB = OuterLoop.getBounds(SE); + if (OuterLoopLB == None) { + LLVM_DEBUG(dbgs() << "Cannot compute loop bounds of OuterLoop: " + << OuterLoop << "\n";); + return false; + } + + // Identify the outer loop latch comparison instruction. + const BasicBlock *Latch = OuterLoop.getLoopLatch(); + assert(Latch && "Expecting a valid loop latch"); + const BranchInst *BI = dyn_cast(Latch->getTerminator()); + assert(BI && BI->isConditional() && + "Expecting loop latch terminator to be a branch instruction"); + + const CmpInst *OuterLoopLatchCmp = dyn_cast(BI->getCondition()); + DEBUG_WITH_TYPE( + VerboseDebug, if (OuterLoopLatchCmp) { + dbgs() << "Outer loop latch compare instruction: " << *OuterLoopLatchCmp + << "\n"; + }); + + // Identify the inner loop guard instruction. + BranchInst *InnerGuard = InnerLoop.getLoopGuardBranch(); + const CmpInst *InnerLoopGuardCmp = + (InnerGuard) ? 
dyn_cast(InnerGuard->getCondition()) : nullptr; + + DEBUG_WITH_TYPE( + VerboseDebug, if (InnerLoopGuardCmp) { + dbgs() << "Inner loop guard compare instruction: " << *InnerLoopGuardCmp + << "\n"; + }); + + // Determine whether instructions in a basic block are one of: + // - the inner loop guard comparison + // - the outer loop latch comparison + // - the outer loop induction variable increment + // - a phi node, a cast or a branch + auto containsOnlySafeInstructions = [&](const BasicBlock &BB) { + return llvm::all_of(BB, [&](const Instruction &I) { + bool isAllowed = isSafeToSpeculativelyExecute(&I) || isa(I) || + isa(I); + if (!isAllowed) { + DEBUG_WITH_TYPE(VerboseDebug, { + dbgs() << "Instruction: " << I << "\nin basic block: " << BB + << " is considered unsafe.\n"; + }); + return false; + } + + // The only binary instruction allowed is the outer loop step instruction, + // the only comparison instructions allowed are the inner loop guard + // compare instruction and the outer loop latch compare instruction. + if ((isa(I) && &I != &OuterLoopLB->getStepInst()) || + (isa(I) && &I != OuterLoopLatchCmp && + &I != InnerLoopGuardCmp)) { + DEBUG_WITH_TYPE(VerboseDebug, { + dbgs() << "Instruction: " << I << "\nin basic block:" << BB + << "is unsafe.\n"; + }); + return false; + } + return true; + }); + }; + + // Check the code surrounding the inner loop for instructions that are deemed + // unsafe. 
+ const BasicBlock *OuterLoopHeader = OuterLoop.getHeader(); + const BasicBlock *OuterLoopLatch = OuterLoop.getLoopLatch(); + const BasicBlock *InnerLoopPreHeader = InnerLoop.getLoopPreheader(); + + if (!containsOnlySafeInstructions(*OuterLoopHeader) || + !containsOnlySafeInstructions(*OuterLoopLatch) || + (InnerLoopPreHeader != OuterLoopHeader && + !containsOnlySafeInstructions(*InnerLoopPreHeader)) || + !containsOnlySafeInstructions(*InnerLoop.getExitBlock())) { + LLVM_DEBUG(dbgs() << "Not perfectly nested: code surrounding inner loop is " + "unsafe\n";); + return false; + } + + LLVM_DEBUG(dbgs() << "Loop '" << OuterLoop.getName() << "' and '" + << InnerLoop.getName() << "' are perfectly nested.\n"); + + return true; +} + +SmallVector +LoopNest::getPerfectLoops(ScalarEvolution &SE) const { + SmallVector LV; + LoopVectorTy PerfectNest; + + for (Loop *L : depth_first(const_cast(Loops.front()))) { + if (PerfectNest.empty()) + PerfectNest.push_back(L); + + auto &SubLoops = L->getSubLoops(); + if (SubLoops.size() == 1 && arePerfectlyNested(*L, *SubLoops.front(), SE)) { + PerfectNest.push_back(SubLoops.front()); + } else { + LV.push_back(PerfectNest); + PerfectNest.clear(); + } + } + + return LV; +} + +unsigned LoopNest::getMaxPerfectDepth(const Loop &Root, ScalarEvolution &SE) { + LLVM_DEBUG(dbgs() << "Get maximum perfect depth of loop nest rooted by loop '" + << Root.getName() << "'\n"); + + const Loop *CurrentLoop = &Root; + const auto *SubLoops = &CurrentLoop->getSubLoops(); + unsigned CurrentDepth = 1; + + while (SubLoops->size() == 1) { + const Loop *InnerLoop = SubLoops->front(); + if (!arePerfectlyNested(*CurrentLoop, *InnerLoop, SE)) { + LLVM_DEBUG({ + dbgs() << "Not a perfect nest: loop '" << CurrentLoop->getName() + << "' is not perfectly nested with loop '" + << InnerLoop->getName() << "'\n"; + }); + break; + } + + CurrentLoop = InnerLoop; + SubLoops = &CurrentLoop->getSubLoops(); + ++CurrentDepth; + } + + return CurrentDepth; +} + +static bool 
checkLoopsStructure(const Loop &OuterLoop, const Loop &InnerLoop, + ScalarEvolution &SE) { + // The inner loop must be the only outer loop's child. + if ((OuterLoop.getSubLoops().size() != 1) || + (InnerLoop.getParentLoop() != &OuterLoop)) + return false; + + // We expect loops in normal form which have a preheader, header, latch... + if (!OuterLoop.isLoopSimplifyForm() || !InnerLoop.isLoopSimplifyForm()) + return false; + + const BasicBlock *OuterLoopHeader = OuterLoop.getHeader(); + const BasicBlock *OuterLoopLatch = OuterLoop.getLoopLatch(); + const BasicBlock *InnerLoopPreHeader = InnerLoop.getLoopPreheader(); + const BasicBlock *InnerLoopLatch = InnerLoop.getLoopLatch(); + const BasicBlock *InnerLoopExit = InnerLoop.getExitBlock(); + + // We expect rotated loops. The inner loop should have a single exit block. + if (OuterLoop.getExitingBlock() != OuterLoopLatch || + InnerLoop.getExitingBlock() != InnerLoopLatch || !InnerLoopExit) + return false; + + // Ensure the only branch that may exist between the loops is the inner loop + // guard. + if (OuterLoopHeader != InnerLoopPreHeader) { + const BranchInst *BI = + dyn_cast(OuterLoopHeader->getTerminator()); + + if (!BI || BI != InnerLoop.getLoopGuardBranch()) + return false; + + // The successors of the inner loop guard should be the inner loop + // preheader and the outer loop latch. + for (const BasicBlock *Succ : BI->successors()) { + if (Succ == InnerLoopPreHeader) + continue; + if (Succ == OuterLoopLatch) + continue; + + DEBUG_WITH_TYPE(VerboseDebug, { + dbgs() << "Inner loop guard successor " << Succ->getName() + << " doesn't lead to inner loop preheader or " + "outer loop latch.\n"; + }); + return false; + } + } + + // Ensure the inner loop exit block leads to the outer loop latch. 
+ if (InnerLoopExit->getSingleSuccessor() != OuterLoopLatch) { + DEBUG_WITH_TYPE( + VerboseDebug, + dbgs() << "Inner loop exit block " << *InnerLoopExit + << " does not directly lead to the outer loop latch.\n";); + return false; + } + + return true; +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, const LoopNest &LN) { + OS << "IsPerfect="; + if (LN.getMaxPerfectDepth() == LN.getNestDepth()) + OS << "true"; + else + OS << "false"; + OS << ", Depth=" << LN.getNestDepth(); + OS << ", OutermostLoop: " << LN.getOutermostLoop().getName(); + OS << ", Loops: ( "; + for (const Loop *L : LN.getLoops()) + OS << L->getName() << " "; + OS << ")"; + + return OS; +} + +//===----------------------------------------------------------------------===// +// LoopNestPrinterPass implementation +// + +PreservedAnalyses LoopNestPrinterPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &U) { + if (auto LN = LoopNest::getLoopNest(L, AR.SE)) + OS << *LN << "\n"; + + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp index 911d39d9a2637b..678d66f632a8ab 100644 --- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp +++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp @@ -195,15 +195,19 @@ bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F, return true; } -// Like isFunctionHotInCallGraph but for a given cutoff. 
-bool ProfileSummaryInfo::isFunctionHotInCallGraphNthPercentile( +template +bool ProfileSummaryInfo::isFunctionHotOrColdInCallGraphNthPercentile( int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) { if (!F || !computeSummary()) return false; - if (auto FunctionCount = F->getEntryCount()) - if (isHotCountNthPercentile(PercentileCutoff, FunctionCount.getCount())) + if (auto FunctionCount = F->getEntryCount()) { + if (isHot && + isHotCountNthPercentile(PercentileCutoff, FunctionCount.getCount())) return true; - + if (!isHot && + !isColdCountNthPercentile(PercentileCutoff, FunctionCount.getCount())) + return false; + } if (hasSampleProfile()) { uint64_t TotalCallCount = 0; for (const auto &BB : *F) @@ -211,13 +215,31 @@ bool ProfileSummaryInfo::isFunctionHotInCallGraphNthPercentile( if (isa(I) || isa(I)) if (auto CallCount = getProfileCount(&I, nullptr)) TotalCallCount += CallCount.getValue(); - if (isHotCountNthPercentile(PercentileCutoff, TotalCallCount)) + if (isHot && isHotCountNthPercentile(PercentileCutoff, TotalCallCount)) return true; + if (!isHot && !isColdCountNthPercentile(PercentileCutoff, TotalCallCount)) + return false; } - for (const auto &BB : *F) - if (isHotBlockNthPercentile(PercentileCutoff, &BB, &BFI)) + for (const auto &BB : *F) { + if (isHot && isHotBlockNthPercentile(PercentileCutoff, &BB, &BFI)) return true; - return false; + if (!isHot && !isColdBlockNthPercentile(PercentileCutoff, &BB, &BFI)) + return false; + } + return !isHot; +} + +// Like isFunctionHotInCallGraph but for a given cutoff. 
+bool ProfileSummaryInfo::isFunctionHotInCallGraphNthPercentile( + int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) { + return isFunctionHotOrColdInCallGraphNthPercentile( + PercentileCutoff, F, BFI); +} + +bool ProfileSummaryInfo::isFunctionColdInCallGraphNthPercentile( + int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) { + return isFunctionHotOrColdInCallGraphNthPercentile( + PercentileCutoff, F, BFI); } /// Returns true if the function's entry is a cold. If it returns false, it @@ -299,9 +321,22 @@ bool ProfileSummaryInfo::isColdCount(uint64_t C) { return ColdCountThreshold && C <= ColdCountThreshold.getValue(); } -bool ProfileSummaryInfo::isHotCountNthPercentile(int PercentileCutoff, uint64_t C) { +template +bool ProfileSummaryInfo::isHotOrColdCountNthPercentile(int PercentileCutoff, + uint64_t C) { auto CountThreshold = computeThreshold(PercentileCutoff); - return CountThreshold && C >= CountThreshold.getValue(); + if (isHot) + return CountThreshold && C >= CountThreshold.getValue(); + else + return CountThreshold && C <= CountThreshold.getValue(); +} + +bool ProfileSummaryInfo::isHotCountNthPercentile(int PercentileCutoff, uint64_t C) { + return isHotOrColdCountNthPercentile(PercentileCutoff, C); +} + +bool ProfileSummaryInfo::isColdCountNthPercentile(int PercentileCutoff, uint64_t C) { + return isHotOrColdCountNthPercentile(PercentileCutoff, C); } uint64_t ProfileSummaryInfo::getOrCompHotCountThreshold() { @@ -327,11 +362,27 @@ bool ProfileSummaryInfo::isColdBlock(const BasicBlock *BB, return Count && isColdCount(*Count); } +template +bool ProfileSummaryInfo::isHotOrColdBlockNthPercentile(int PercentileCutoff, + const BasicBlock *BB, + BlockFrequencyInfo *BFI) { + auto Count = BFI->getBlockProfileCount(BB); + if (isHot) + return Count && isHotCountNthPercentile(PercentileCutoff, *Count); + else + return Count && isColdCountNthPercentile(PercentileCutoff, *Count); +} + bool ProfileSummaryInfo::isHotBlockNthPercentile(int 
PercentileCutoff, const BasicBlock *BB, BlockFrequencyInfo *BFI) { - auto Count = BFI->getBlockProfileCount(BB); - return Count && isHotCountNthPercentile(PercentileCutoff, *Count); + return isHotOrColdBlockNthPercentile(PercentileCutoff, BB, BFI); +} + +bool ProfileSummaryInfo::isColdBlockNthPercentile(int PercentileCutoff, + const BasicBlock *BB, + BlockFrequencyInfo *BFI) { + return isHotOrColdBlockNthPercentile(PercentileCutoff, BB, BFI); } bool ProfileSummaryInfo::isHotCallSite(const CallSite &CS, diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index ad74303a784da4..1279e936607ecc 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -7665,9 +7665,6 @@ bool LLParser::ParseTypeTestResolution(TypeTestResolution &TTRes) { return true; switch (Lex.getKind()) { - case lltok::kw_unknown: - TTRes.TheKind = TypeTestResolution::Unknown; - break; case lltok::kw_unsat: TTRes.TheKind = TypeTestResolution::Unsat; break; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index def2dc0e0889ed..134ef74b27047c 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -1045,6 +1045,8 @@ void DwarfUnit::constructTemplateTypeParameterDIE( addType(ParamDIE, TP->getType()); if (!TP->getName().empty()) addString(ParamDIE, dwarf::DW_AT_name, TP->getName()); + if (TP->isDefault() && (DD->getDwarfVersion() >= 5)) + addFlag(ParamDIE, dwarf::DW_AT_default_value); } void DwarfUnit::constructTemplateValueParameterDIE( @@ -1057,6 +1059,8 @@ void DwarfUnit::constructTemplateValueParameterDIE( addType(ParamDIE, VP->getType()); if (!VP->getName().empty()) addString(ParamDIE, dwarf::DW_AT_name, VP->getName()); + if (VP->isDefault() && (DD->getDwarfVersion() >= 5)) + addFlag(ParamDIE, dwarf::DW_AT_default_value); if (Metadata *Val = VP->getValue()) { if (ConstantInt *CI = mdconst::dyn_extract(Val)) addConstantValue(ParamDIE, CI, 
VP->getType()); diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp index a1adf4ef9820cd..d0dd538f1f525a 100644 --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -76,7 +76,7 @@ class MemCmpExpansion { IRBuilder<> Builder; // Represents the decomposition in blocks of the expansion. For example, // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and - // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {32, 1}. + // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}. struct LoadEntry { LoadEntry(unsigned LoadSize, uint64_t Offset) : LoadSize(LoadSize), Offset(Offset) { diff --git a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp index 1c4a668e5f316c..9aac47ecb35078 100644 --- a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" @@ -40,60 +41,6 @@ void Localizer::init(MachineFunction &MF) { TTI = &getAnalysis().getTTI(MF.getFunction()); } -bool Localizer::shouldLocalize(const MachineInstr &MI) { - // Assuming a spill and reload of a value has a cost of 1 instruction each, - // this helper function computes the maximum number of uses we should consider - // for remat. E.g. on arm64 global addresses take 2 insts to materialize. We - // break even in terms of code size when the original MI has 2 users vs - // choosing to potentially spill. Any more than 2 users we we have a net code - // size increase. This doesn't take into account register pressure though. - auto maxUses = [](unsigned RematCost) { - // A cost of 1 means remats are basically free. 
- if (RematCost == 1) - return UINT_MAX; - if (RematCost == 2) - return 2U; - - // Remat is too expensive, only sink if there's one user. - if (RematCost > 2) - return 1U; - llvm_unreachable("Unexpected remat cost"); - }; - - // Helper to walk through uses and terminate if we've reached a limit. Saves - // us spending time traversing uses if all we want to know is if it's >= min. - auto isUsesAtMost = [&](unsigned Reg, unsigned MaxUses) { - unsigned NumUses = 0; - auto UI = MRI->use_instr_nodbg_begin(Reg), UE = MRI->use_instr_nodbg_end(); - for (; UI != UE && NumUses < MaxUses; ++UI) { - NumUses++; - } - // If we haven't reached the end yet then there are more than MaxUses users. - return UI == UE; - }; - - switch (MI.getOpcode()) { - default: - return false; - // Constants-like instructions should be close to their users. - // We don't want long live-ranges for them. - case TargetOpcode::G_CONSTANT: - case TargetOpcode::G_FCONSTANT: - case TargetOpcode::G_FRAME_INDEX: - case TargetOpcode::G_INTTOPTR: - return true; - case TargetOpcode::G_GLOBAL_VALUE: { - unsigned RematCost = TTI->getGISelRematGlobalCost(); - Register Reg = MI.getOperand(0).getReg(); - unsigned MaxUses = maxUses(RematCost); - if (MaxUses == UINT_MAX) - return true; // Remats are "free" so always localize. - bool B = isUsesAtMost(Reg, MaxUses); - return B; - } - } -} - void Localizer::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); getSelectionDAGFallbackAnalysisUsage(AU); @@ -119,9 +66,10 @@ bool Localizer::localizeInterBlock(MachineFunction &MF, // we only localize instructions in the entry block here. This might change if // we start doing CSE across blocks. 
auto &MBB = MF.front(); + auto &TL = *MF.getSubtarget().getTargetLowering(); for (auto RI = MBB.rbegin(), RE = MBB.rend(); RI != RE; ++RI) { MachineInstr &MI = *RI; - if (!shouldLocalize(MI)) + if (!TL.shouldLocalize(MI, TTI)) continue; LLVM_DEBUG(dbgs() << "Should localize: " << MI); assert(MI.getDesc().getNumDefs() == 1 && diff --git a/llvm/lib/CodeGen/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues.cpp index ad54378d9edc6d..4d0c2462b7d3ca 100644 --- a/llvm/lib/CodeGen/LiveDebugValues.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues.cpp @@ -138,7 +138,10 @@ struct LocIndex { return (static_cast(Location) << 32) | Index; } - static LocIndex fromRawInteger(uint64_t ID) { + template static LocIndex fromRawInteger(IntT ID) { + static_assert(std::is_unsigned::value && + sizeof(ID) == sizeof(uint64_t), + "Cannot convert raw integer to LocIndex"); return {static_cast(ID >> 32), static_cast(ID)}; } diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 2a220c02613c81..c20c1552377dcc 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -750,10 +750,10 @@ bool MIParser::parseBasicBlockLiveins(MachineBasicBlock &MBB) { if (Token.isNot(MIToken::IntegerLiteral) && Token.isNot(MIToken::HexLiteral)) return error("expected a lane mask"); - static_assert(sizeof(LaneBitmask::Type) == sizeof(unsigned), + static_assert(sizeof(LaneBitmask::Type) == sizeof(uint64_t), "Use correct get-function for lane mask"); LaneBitmask::Type V; - if (getUnsigned(V)) + if (getUint64(V)) return error("invalid lane mask value"); Mask = LaneBitmask(V); lex(); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index ae11d7c5dfee9a..eafa0974a7357f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3864,33 +3864,18 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue 
N1, // the comparison operands is infinity or negative infinity, convert the // condition to a less-awkward <= or >=. if (CFP->getValueAPF().isInfinity()) { - if (CFP->getValueAPF().isNegative()) { - if (Cond == ISD::SETOEQ && - isCondCodeLegal(ISD::SETOLE, N0.getSimpleValueType())) - return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOLE); - if (Cond == ISD::SETUEQ && - isCondCodeLegal(ISD::SETOLE, N0.getSimpleValueType())) - return DAG.getSetCC(dl, VT, N0, N1, ISD::SETULE); - if (Cond == ISD::SETUNE && - isCondCodeLegal(ISD::SETUGT, N0.getSimpleValueType())) - return DAG.getSetCC(dl, VT, N0, N1, ISD::SETUGT); - if (Cond == ISD::SETONE && - isCondCodeLegal(ISD::SETUGT, N0.getSimpleValueType())) - return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOGT); - } else { - if (Cond == ISD::SETOEQ && - isCondCodeLegal(ISD::SETOGE, N0.getSimpleValueType())) - return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOGE); - if (Cond == ISD::SETUEQ && - isCondCodeLegal(ISD::SETOGE, N0.getSimpleValueType())) - return DAG.getSetCC(dl, VT, N0, N1, ISD::SETUGE); - if (Cond == ISD::SETUNE && - isCondCodeLegal(ISD::SETULT, N0.getSimpleValueType())) - return DAG.getSetCC(dl, VT, N0, N1, ISD::SETULT); - if (Cond == ISD::SETONE && - isCondCodeLegal(ISD::SETULT, N0.getSimpleValueType())) - return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOLT); + bool IsNegInf = CFP->getValueAPF().isNegative(); + ISD::CondCode NewCond = ISD::SETCC_INVALID; + switch (Cond) { + case ISD::SETOEQ: NewCond = IsNegInf ? ISD::SETOLE : ISD::SETOGE; break; + case ISD::SETUEQ: NewCond = IsNegInf ? ISD::SETULE : ISD::SETUGE; break; + case ISD::SETUNE: NewCond = IsNegInf ? ISD::SETUGT : ISD::SETULT; break; + case ISD::SETONE: NewCond = IsNegInf ? 
ISD::SETOGT : ISD::SETOLT; break; + default: break; } + if (NewCond != ISD::SETCC_INVALID && + isCondCodeLegal(NewCond, N0.getSimpleValueType())) + return DAG.getSetCC(dl, VT, N0, N1, NewCond); } } } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 95c63d09718cfc..436857d6b2150b 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -2072,3 +2073,64 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI, Flags |= getTargetMMOFlags(AI); return Flags; } + +//===----------------------------------------------------------------------===// +// GlobalISel Hooks +//===----------------------------------------------------------------------===// + +bool TargetLoweringBase::shouldLocalize(const MachineInstr &MI, + const TargetTransformInfo *TTI) const { + auto &MF = *MI.getMF(); + auto &MRI = MF.getRegInfo(); + // Assuming a spill and reload of a value has a cost of 1 instruction each, + // this helper function computes the maximum number of uses we should consider + // for remat. E.g. on arm64 global addresses take 2 insts to materialize. We + // break even in terms of code size when the original MI has 2 users vs + // choosing to potentially spill. Any more than 2 users and we have a net code + // size increase. This doesn't take into account register pressure though. + auto maxUses = [](unsigned RematCost) { + // A cost of 1 means remats are basically free. + if (RematCost == 1) + return UINT_MAX; + if (RematCost == 2) + return 2U; + + // Remat is too expensive, only sink if there's one user. 
+ if (RematCost > 2) + return 1U; + llvm_unreachable("Unexpected remat cost"); + }; + + // Helper to walk through uses and terminate if we've reached a limit. Saves + // us spending time traversing uses if all we want to know is if it's >= min. + auto isUsesAtMost = [&](unsigned Reg, unsigned MaxUses) { + unsigned NumUses = 0; + auto UI = MRI.use_instr_nodbg_begin(Reg), UE = MRI.use_instr_nodbg_end(); + for (; UI != UE && NumUses < MaxUses; ++UI) { + NumUses++; + } + // If we haven't reached the end yet then there are more than MaxUses users. + return UI == UE; + }; + + switch (MI.getOpcode()) { + default: + return false; + // Constants-like instructions should be close to their users. + // We don't want long live-ranges for them. + case TargetOpcode::G_CONSTANT: + case TargetOpcode::G_FCONSTANT: + case TargetOpcode::G_FRAME_INDEX: + case TargetOpcode::G_INTTOPTR: + return true; + case TargetOpcode::G_GLOBAL_VALUE: { + unsigned RematCost = TTI->getGISelRematGlobalCost(); + Register Reg = MI.getOperand(0).getReg(); + unsigned MaxUses = maxUses(RematCost); + if (MaxUses == UINT_MAX) + return true; // Remats are "free" so always localize. + bool B = isUsesAtMost(Reg, MaxUses); + return B; + } + } +} diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp index 8cb259ebc6222b..2e06e14ee3be30 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp @@ -18,7 +18,8 @@ using namespace dwarf; void DWARFDebugMacro::dump(raw_ostream &OS) const { unsigned IndLevel = 0; for (const auto &Macros : MacroLists) { - for (const Entry &E : Macros) { + OS << format("0x%08" PRIx64 ":\n", Macros.Offset); + for (const Entry &E : Macros.Macros) { // There should not be DW_MACINFO_end_file when IndLevel is Zero. However, // this check handles the case of corrupted ".debug_macinfo" section. 
if (IndLevel > 0) @@ -51,7 +52,6 @@ void DWARFDebugMacro::dump(raw_ostream &OS) const { } OS << "\n"; } - OS << "\n"; } } @@ -62,15 +62,17 @@ void DWARFDebugMacro::parse(DataExtractor data) { if (!M) { MacroLists.emplace_back(); M = &MacroLists.back(); + M->Offset = Offset; } // A macro list entry consists of: - M->emplace_back(); - Entry &E = M->back(); + M->Macros.emplace_back(); + Entry &E = M->Macros.back(); // 1. Macinfo type E.Type = data.getULEB128(&Offset); if (E.Type == 0) { // Reached end of a ".debug_macinfo" section contribution. + M = nullptr; continue; } diff --git a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp index 3a84ac41e86cad..091f1af9d11a40 100644 --- a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp +++ b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp @@ -29,7 +29,13 @@ uint32_t GsymCreator::insertFile(StringRef Path, llvm::sys::path::Style Style) { llvm::StringRef directory = llvm::sys::path::parent_path(Path, Style); llvm::StringRef filename = llvm::sys::path::filename(Path, Style); - FileEntry FE(insertString(directory), insertString(filename)); + // We must insert the strings first, then call the FileEntry constructor. + // If we inline the insertString() function call into the constructor, the + // call order is undefined due to parameter lists not having any ordering + // requirements. 
+ const uint32_t Dir = insertString(directory); + const uint32_t Base = insertString(filename); + FileEntry FE(Dir, Base); std::lock_guard Guard(Mutex); const auto NextIndex = Files.size(); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 7c0b79fcabd93d..c2added23de80d 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -2780,8 +2780,6 @@ static const char *getWholeProgDevirtResByArgKindName( static const char *getTTResKindName(TypeTestResolution::Kind K) { switch (K) { - case TypeTestResolution::Unknown: - return "unknown"; case TypeTestResolution::Unsat: return "unsat"; case TypeTestResolution::ByteArray: diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index b630a2893b4d93..bd8829e92656e3 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -660,12 +660,7 @@ DISubprogram *DISubprogram::getImpl( bool DISubprogram::describes(const Function *F) const { assert(F && "Invalid function"); - if (F->getSubprogram() == this) - return true; - StringRef Name = getLinkageName(); - if (Name.empty()) - Name = getName(); - return F->getName() == Name; + return F->getSubprogram() == this; } DILexicalBlock *DILexicalBlock::getImpl(LLVMContext &Context, Metadata *Scope, diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index cf0eac90865249..eaabd553e95b48 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2364,8 +2364,7 @@ void Verifier::visitFunction(const Function &F) { if (!HasDebugInfo) return; - // Check that all !dbg attachments lead to back to N (or, at least, another - // subprogram that describes the same function). + // Check that all !dbg attachments lead to back to N. // // FIXME: Check this incrementally while visiting !dbg attachments. // FIXME: Only check when N is the canonical subprogram for F. 
@@ -2394,11 +2393,9 @@ void Verifier::visitFunction(const Function &F) { if (SP && ((Scope != SP) && !Seen.insert(SP).second)) return; - // FIXME: Once N is canonical, check "SP == &N". AssertDI(SP->describes(&F), "!dbg attachment points at wrong subprogram for function", N, &F, &I, DL, Scope, SP); - visitMDNode(*SP); }; for (auto &BB : F) for (auto &I : BB) { diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index ec57744cf4803d..b749909d7871a1 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -61,8 +61,10 @@ Error Config::addSaveTemps(std::string OutputFileName, std::error_code EC; ResolutionFile = std::make_unique( OutputFileName + "resolution.txt", EC, sys::fs::OpenFlags::OF_Text); - if (EC) + if (EC) { + ResolutionFile.reset(); return errorCodeToError(EC); + } auto setHook = [&](std::string PathSuffix, ModuleHookFn &Hook) { // Keep track of the hook provided by the linker, which also needs to run. diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 8582d5143aa8b6..b32c9b5fdfacbc 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -285,43 +285,6 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout, return IsResolved; } -/// Check if the branch crosses the boundary. -/// -/// \param StartAddr start address of the fused/unfused branch. -/// \param Size size of the fused/unfused branch. -/// \param BoundaryAlignment alignment requirement of the branch. -/// \returns true if the branch cross the boundary. -static bool mayCrossBoundary(uint64_t StartAddr, uint64_t Size, - Align BoundaryAlignment) { - uint64_t EndAddr = StartAddr + Size; - return (StartAddr >> Log2(BoundaryAlignment)) != - ((EndAddr - 1) >> Log2(BoundaryAlignment)); -} - -/// Check if the branch is against the boundary. -/// -/// \param StartAddr start address of the fused/unfused branch. -/// \param Size size of the fused/unfused branch. 
-/// \param BoundaryAlignment alignment requirement of the branch. -/// \returns true if the branch is against the boundary. -static bool isAgainstBoundary(uint64_t StartAddr, uint64_t Size, - Align BoundaryAlignment) { - uint64_t EndAddr = StartAddr + Size; - return (EndAddr & (BoundaryAlignment.value() - 1)) == 0; -} - -/// Check if the branch needs padding. -/// -/// \param StartAddr start address of the fused/unfused branch. -/// \param Size size of the fused/unfused branch. -/// \param BoundaryAlignment alignment requirement of the branch. -/// \returns true if the branch needs padding. -static bool needPadding(uint64_t StartAddr, uint64_t Size, - Align BoundaryAlignment) { - return mayCrossBoundary(StartAddr, Size, BoundaryAlignment) || - isAgainstBoundary(StartAddr, Size, BoundaryAlignment); -} - uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, const MCFragment &F) const { assert(getBackendPtr() && "Requires assembler backend"); @@ -351,26 +314,8 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, case MCFragment::FT_LEB: return cast(F).getContents().size(); - case MCFragment::FT_BoundaryAlign: { - const MCBoundaryAlignFragment &BF = cast(F); - // MCBoundaryAlignFragment that doesn't emit NOP should have 0 size. - if (!BF.canEmitNops()) - return 0; - - uint64_t AlignedOffset = Layout.getFragmentOffset(&BF); - uint64_t AlignedSize = 0; - const MCFragment *F = BF.getNextNode(); - // If the branch is unfused, it is emitted into one fragment, otherwise it - // is emitted into two fragments at most, the next - // MCBoundaryAlignFragment(if exists) also marks the end of the branch. - for (int I = 0, N = BF.isFused() ? 2 : 1; - I != N && !isa(F); ++I, F = F->getNextNode()) - AlignedSize += computeFragmentSize(Layout, *F); - Align BoundaryAlignment = BF.getAlignment(); - return needPadding(AlignedOffset, AlignedSize, BoundaryAlignment) - ? 
offsetToAlignment(AlignedOffset, BoundaryAlignment) - : 0U; - } + case MCFragment::FT_BoundaryAlign: + return cast(F).getSize(); case MCFragment::FT_SymbolId: return 4; @@ -1012,6 +957,72 @@ bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) { return OldSize != LF.getContents().size(); } +/// Check if the branch crosses the boundary. +/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch crosses the boundary. +static bool mayCrossBoundary(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + uint64_t EndAddr = StartAddr + Size; + return (StartAddr >> Log2(BoundaryAlignment)) != + ((EndAddr - 1) >> Log2(BoundaryAlignment)); +} + +/// Check if the branch is against the boundary. +/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch is against the boundary. +static bool isAgainstBoundary(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + uint64_t EndAddr = StartAddr + Size; + return (EndAddr & (BoundaryAlignment.value() - 1)) == 0; +} + +/// Check if the branch needs padding. +/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch needs padding. +static bool needPadding(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + return mayCrossBoundary(StartAddr, Size, BoundaryAlignment) || + isAgainstBoundary(StartAddr, Size, BoundaryAlignment); +} + +bool MCAssembler::relaxBoundaryAlign(MCAsmLayout &Layout, + MCBoundaryAlignFragment &BF) { + // The MCBoundaryAlignFragment that doesn't emit NOP should not be relaxed. 
+ if (!BF.canEmitNops()) + return false; + + uint64_t AlignedOffset = Layout.getFragmentOffset(BF.getNextNode()); + uint64_t AlignedSize = 0; + const MCFragment *F = BF.getNextNode(); + // If the branch is unfused, it is emitted into one fragment, otherwise it is + // emitted into two fragments at most, the next MCBoundaryAlignFragment(if + // exists) also marks the end of the branch. + for (auto i = 0, N = BF.isFused() ? 2 : 1; + i != N && !isa(F); ++i, F = F->getNextNode()) { + AlignedSize += computeFragmentSize(Layout, *F); + } + uint64_t OldSize = BF.getSize(); + AlignedOffset -= OldSize; + Align BoundaryAlignment = BF.getAlignment(); + uint64_t NewSize = needPadding(AlignedOffset, AlignedSize, BoundaryAlignment) + ? offsetToAlignment(AlignedOffset, BoundaryAlignment) + : 0U; + if (NewSize == OldSize) + return false; + BF.setSize(NewSize); + Layout.invalidateFragmentsFrom(&BF); + return true; +} + bool MCAssembler::relaxDwarfLineAddr(MCAsmLayout &Layout, MCDwarfLineAddrFragment &DF) { MCContext &Context = Layout.getAssembler().getContext(); @@ -1112,6 +1123,8 @@ bool MCAssembler::relaxFragment(MCAsmLayout &Layout, MCFragment &F) { cast(F)); case MCFragment::FT_LEB: return relaxLEB(Layout, cast(F)); + case MCFragment::FT_BoundaryAlign: + return relaxBoundaryAlign(Layout, cast(F)); case MCFragment::FT_CVInlineLines: return relaxCVInlineLineTable(Layout, cast(F)); case MCFragment::FT_CVDefRange: @@ -1127,11 +1140,11 @@ bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSection &Sec) { MCFragment *FirstRelaxedFragment = nullptr; // Attempt to relax all the fragments in the section. - for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) { + for (MCFragment &Frag : Sec) { // Check if this is a fragment that needs relaxation. 
- bool RelaxedFrag = relaxFragment(Layout, *I); + bool RelaxedFrag = relaxFragment(Layout, Frag); if (RelaxedFrag && !FirstRelaxedFragment) - FirstRelaxedFragment = &*I; + FirstRelaxedFragment = &Frag; } if (FirstRelaxedFragment) { Layout.invalidateFragmentsFrom(FirstRelaxedFragment); @@ -1144,8 +1157,7 @@ bool MCAssembler::layoutOnce(MCAsmLayout &Layout) { ++stats::RelaxationSteps; bool WasRelaxed = false; - for (iterator it = begin(), ie = end(); it != ie; ++it) { - MCSection &Sec = *it; + for (MCSection &Sec : *this) { while (layoutSectionOnce(Layout, Sec)) WasRelaxed = true; } diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index 42ba3b40c51f98..a96b8e86aed3c7 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -431,7 +431,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { else OS << " unfused branch)"; OS << "\n "; - OS << " BoundarySize:" << BF->getAlignment().value(); + OS << " BoundarySize:" << BF->getAlignment().value() + << " Size:" << BF->getSize(); break; } case MCFragment::FT_SymbolId: { diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index eb5b3a61fa89c6..e0ef37a9562b92 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -38,6 +38,7 @@ #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopNestAnalysis.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" @@ -761,12 +762,6 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, } MPM.addPass(AttributorPass()); - // Lower type metadata and the type.test intrinsic in the ThinLTO - // post link pipeline after ICP. This is to enable usage of the type - // tests in ICP sequences. 
- if (Phase == ThinLTOPhase::PostLink) - MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); - // Interprocedural constant propagation now that basic cleanup has occurred // and prior to optimizing globals. // FIXME: This position in the pipeline hasn't been carefully considered in @@ -1211,9 +1206,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging, // metadata and intrinsics. MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr)); MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); - // Run a second time to clean up any type tests left behind by WPD for use - // in ICP. - MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); return MPM; } @@ -1280,10 +1272,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging, // The LowerTypeTestsPass needs to run to lower type metadata and the // type.test intrinsics. The pass does nothing if CFI is disabled. MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); - // Run a second time to clean up any type tests left behind by WPD for use - // in ICP (which is performed earlier than this in the regular LTO - // pipeline). - MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); return MPM; } @@ -1411,9 +1399,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging, // to be run at link time if CFI is enabled. This pass does nothing if // CFI is disabled. MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); - // Run a second time to clean up any type tests left behind by WPD for use - // in ICP (which is performed earlier than this in the regular LTO pipeline). - MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); // Enable splitting late in the FullLTO post-link pipeline. This is done in // the same stage in the old pass manager (\ref addLateLTOOptimizationPasses). 
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 12f4f85548f319..056e8833ab83e4 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -325,6 +325,7 @@ LOOP_PASS("unroll-full", LoopFullUnrollPass()) LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs())) LOOP_PASS("print", DDGAnalysisPrinterPass(dbgs())) LOOP_PASS("print", IVUsersPrinterPass(dbgs())) +LOOP_PASS("print", LoopNestPrinterPass(dbgs())) LOOP_PASS("print", LoopCachePrinterPass(dbgs())) LOOP_PASS("loop-predication", LoopPredicationPass()) LOOP_PASS("guard-widening", GuardWideningPass()) diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp index 227b12bea5c55c..b75738bc360ce3 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp @@ -776,33 +776,35 @@ static const char *TestingFormatMagic = "llvmcovmtestdata"; Expected> BinaryCoverageReader::createCoverageReaderFromBuffer( - StringRef Coverage, StringRef FuncRecords, InstrProfSymtab &&ProfileNames, + StringRef Coverage, std::string &&FuncRecords, InstrProfSymtab &&ProfileNames, uint8_t BytesInAddress, support::endianness Endian) { - std::unique_ptr Reader(new BinaryCoverageReader()); + std::unique_ptr Reader( + new BinaryCoverageReader(std::move(FuncRecords))); Reader->ProfileNames = std::move(ProfileNames); + StringRef FuncRecordsRef = Reader->FuncRecords; if (BytesInAddress == 4 && Endian == support::endianness::little) { if (Error E = readCoverageMappingData( - Reader->ProfileNames, Coverage, FuncRecords, + Reader->ProfileNames, Coverage, FuncRecordsRef, Reader->MappingRecords, Reader->Filenames, Reader->Decompressed)) return std::move(E); } else if (BytesInAddress == 4 && Endian == support::endianness::big) { if (Error E = readCoverageMappingData( - Reader->ProfileNames, Coverage, FuncRecords, Reader->MappingRecords, - 
Reader->Filenames, Reader->Decompressed)) + Reader->ProfileNames, Coverage, FuncRecordsRef, + Reader->MappingRecords, Reader->Filenames, Reader->Decompressed)) return std::move(E); } else if (BytesInAddress == 8 && Endian == support::endianness::little) { if (Error E = readCoverageMappingData( - Reader->ProfileNames, Coverage, FuncRecords, + Reader->ProfileNames, Coverage, FuncRecordsRef, Reader->MappingRecords, Reader->Filenames, Reader->Decompressed)) return std::move(E); } else if (BytesInAddress == 8 && Endian == support::endianness::big) { if (Error E = readCoverageMappingData( - Reader->ProfileNames, Coverage, FuncRecords, Reader->MappingRecords, - Reader->Filenames, Reader->Decompressed)) + Reader->ProfileNames, Coverage, FuncRecordsRef, + Reader->MappingRecords, Reader->Filenames, Reader->Decompressed)) return std::move(E); } else return make_error(coveragemap_error::malformed); @@ -846,7 +848,10 @@ loadTestingFormat(StringRef Data) { CoverageMapping, "", std::move(ProfileNames), BytesInAddress, Endian); } -static Expected lookupSection(ObjectFile &OF, StringRef Name) { +/// Find all sections that match \p Name. There may be more than one if comdats +/// are in use, e.g. for the __llvm_covfun section on ELF. +static Expected> lookupSections(ObjectFile &OF, + StringRef Name) { // On COFF, the object file section name may end in "$M". This tells the // linker to sort these sections between "$A" and "$Z". The linker removes the // dollar and everything after it in the final binary. Do the same to match. 
@@ -856,14 +861,17 @@ static Expected lookupSection(ObjectFile &OF, StringRef Name) { }; Name = stripSuffix(Name); + std::vector Sections; for (const auto &Section : OF.sections()) { Expected NameOrErr = Section.getName(); if (!NameOrErr) return NameOrErr.takeError(); if (stripSuffix(*NameOrErr) == Name) - return Section; + Sections.push_back(Section); } - return make_error(coveragemap_error::no_data_found); + if (Sections.empty()) + return make_error(coveragemap_error::no_data_found); + return Sections; } static Expected> @@ -895,41 +903,51 @@ loadBinaryFormat(std::unique_ptr Bin, StringRef Arch) { // Look for the sections that we are interested in. auto ObjFormat = OF->getTripleObjectFormat(); auto NamesSection = - lookupSection(*OF, getInstrProfSectionName(IPSK_name, ObjFormat, + lookupSections(*OF, getInstrProfSectionName(IPSK_name, ObjFormat, /*AddSegmentInfo=*/false)); if (auto E = NamesSection.takeError()) return std::move(E); auto CoverageSection = - lookupSection(*OF, getInstrProfSectionName(IPSK_covmap, ObjFormat, - /*AddSegmentInfo=*/false)); + lookupSections(*OF, getInstrProfSectionName(IPSK_covmap, ObjFormat, + /*AddSegmentInfo=*/false)); if (auto E = CoverageSection.takeError()) return std::move(E); - auto CoverageMappingOrErr = CoverageSection->getContents(); + std::vector CoverageSectionRefs = *CoverageSection; + if (CoverageSectionRefs.size() != 1) + return make_error(coveragemap_error::malformed); + auto CoverageMappingOrErr = CoverageSectionRefs.back().getContents(); if (!CoverageMappingOrErr) return CoverageMappingOrErr.takeError(); StringRef CoverageMapping = CoverageMappingOrErr.get(); InstrProfSymtab ProfileNames; - if (Error E = ProfileNames.create(*NamesSection)) + std::vector NamesSectionRefs = *NamesSection; + if (NamesSectionRefs.size() != 1) + return make_error(coveragemap_error::malformed); + if (Error E = ProfileNames.create(NamesSectionRefs.back())) return std::move(E); // Look for the coverage records section (Version4 only). 
- StringRef FuncRecords; - auto CoverageRecordsSection = - lookupSection(*OF, getInstrProfSectionName(IPSK_covfun, ObjFormat, - /*AddSegmentInfo=*/false)); - if (auto E = CoverageRecordsSection.takeError()) + std::string FuncRecords; + auto CoverageRecordsSections = + lookupSections(*OF, getInstrProfSectionName(IPSK_covfun, ObjFormat, + /*AddSegmentInfo=*/false)); + if (auto E = CoverageRecordsSections.takeError()) consumeError(std::move(E)); else { - auto CoverageRecordsOrErr = CoverageRecordsSection->getContents(); - if (!CoverageRecordsOrErr) - return CoverageRecordsOrErr.takeError(); - FuncRecords = CoverageRecordsOrErr.get(); + for (SectionRef Section : *CoverageRecordsSections) { + auto CoverageRecordsOrErr = Section.getContents(); + if (!CoverageRecordsOrErr) + return CoverageRecordsOrErr.takeError(); + FuncRecords += CoverageRecordsOrErr.get(); + while (FuncRecords.size() % 8 != 0) + FuncRecords += '\0'; + } } return BinaryCoverageReader::createCoverageReaderFromBuffer( - CoverageMapping, FuncRecords, std::move(ProfileNames), BytesInAddress, - Endian); + CoverageMapping, std::move(FuncRecords), std::move(ProfileNames), + BytesInAddress, Endian); } Expected>> diff --git a/llvm/lib/Support/APSInt.cpp b/llvm/lib/Support/APSInt.cpp index 7c48880f96eac5..6805e06df33308 100644 --- a/llvm/lib/Support/APSInt.cpp +++ b/llvm/lib/Support/APSInt.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/APSInt.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/StringRef.h" +#include using namespace llvm; diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 77b507d20a50eb..75a62f45da3669 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -1,7 +1,7 @@ -if(LLVM_ENABLE_ZLIB) - set(imported_libs ZLIB::ZLIB) +set(system_libs) +if ( LLVM_ENABLE_ZLIB AND HAVE_LIBZ ) + set(system_libs ${system_libs} ${ZLIB_LIBRARIES}) endif() - if( MSVC OR MINGW ) # libuuid required for FOLDERID_Profile usage in lib/Support/Windows/Path.inc. 
# advapi32 required for CryptAcquireContextW in lib/Support/Windows/Path.inc. @@ -186,31 +186,10 @@ add_llvm_component_library(LLVMSupport ${LLVM_MAIN_INCLUDE_DIR}/llvm/ADT ${LLVM_MAIN_INCLUDE_DIR}/llvm/Support ${Backtrace_INCLUDE_DIRS} - LINK_LIBS ${system_libs} ${imported_libs} ${delayload_flags} ${Z3_LINK_FILES} + LINK_LIBS ${system_libs} ${delayload_flags} ${Z3_LINK_FILES} ) -set(llvm_system_libs ${system_libs}) - -if(LLVM_ENABLE_ZLIB) - string(TOUPPER ${CMAKE_BUILD_TYPE} build_type) - get_property(zlib_library TARGET ZLIB::ZLIB PROPERTY LOCATION_${build_type}) - if(NOT zlib_library) - get_property(zlib_library TARGET ZLIB::ZLIB PROPERTY LOCATION) - endif() - get_filename_component(zlib_library ${zlib_library} NAME) - if(CMAKE_STATIC_LIBRARY_PREFIX AND - zlib_library MATCHES "^${CMAKE_STATIC_LIBRARY_PREFIX}.*${CMAKE_STATIC_LIBRARY_SUFFIX}$") - STRING(REGEX REPLACE "^${CMAKE_STATIC_LIBRARY_PREFIX}" "" zlib_library ${zlib_library}) - STRING(REGEX REPLACE "${CMAKE_STATIC_LIBRARY_SUFFIX}$" "" zlib_library ${zlib_library}) - elseif(CMAKE_SHARED_LIBRARY_PREFIX AND - zlib_library MATCHES "^${CMAKE_SHARED_LIBRARY_PREFIX}.*${CMAKE_SHARED_LIBRARY_SUFFIX}$") - STRING(REGEX REPLACE "^${CMAKE_SHARED_LIBRARY_PREFIX}" "" zlib_library ${zlib_library}) - STRING(REGEX REPLACE "${CMAKE_SHARED_LIBRARY_SUFFIX}$" "" zlib_library ${zlib_library}) - endif() - set(llvm_system_libs ${llvm_system_libs} "${zlib_library}") -endif() - -set_property(TARGET LLVMSupport PROPERTY LLVM_SYSTEM_LIBS "${llvm_system_libs}") +set_property(TARGET LLVMSupport PROPERTY LLVM_SYSTEM_LIBS "${system_libs}") if(LLVM_WITH_Z3) target_include_directories(LLVMSupport SYSTEM diff --git a/llvm/lib/Support/CRC.cpp b/llvm/lib/Support/CRC.cpp index 2bc668beed3223..7ff09debe3b7c8 100644 --- a/llvm/lib/Support/CRC.cpp +++ b/llvm/lib/Support/CRC.cpp @@ -25,7 +25,7 @@ using namespace llvm; -#if !LLVM_ENABLE_ZLIB +#if LLVM_ENABLE_ZLIB == 0 || !HAVE_ZLIB_H static const uint32_t CRCTable[256] = { 0x00000000, 0x77073096, 
0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, diff --git a/llvm/lib/Support/Compression.cpp b/llvm/lib/Support/Compression.cpp index 4165a2740cd030..97d5ffaadf8273 100644 --- a/llvm/lib/Support/Compression.cpp +++ b/llvm/lib/Support/Compression.cpp @@ -17,13 +17,13 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" -#if LLVM_ENABLE_ZLIB +#if LLVM_ENABLE_ZLIB == 1 && HAVE_ZLIB_H #include #endif using namespace llvm; -#if LLVM_ENABLE_ZLIB +#if LLVM_ENABLE_ZLIB == 1 && HAVE_LIBZ static Error createError(StringRef Err) { return make_error(Err, inconvertibleErrorCode()); } diff --git a/llvm/lib/Support/FileUtilities.cpp b/llvm/lib/Support/FileUtilities.cpp index d11fbb54dc0d85..b95f92c86e996a 100644 --- a/llvm/lib/Support/FileUtilities.cpp +++ b/llvm/lib/Support/FileUtilities.cpp @@ -318,9 +318,8 @@ llvm::Error llvm::writeFileAtomically( atomic_write_error::output_stream_error); } - if (const std::error_code Error = - sys::fs::rename(/*from=*/GeneratedUniqPath.c_str(), - /*to=*/FinalPath.str().c_str())) { + if (sys::fs::rename(/*from=*/GeneratedUniqPath.c_str(), + /*to=*/FinalPath.str().c_str())) { return llvm::make_error( atomic_write_error::failed_to_rename_temp_file); } diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp index f9e89f69b528cd..0d61fae223239d 100644 --- a/llvm/lib/Support/FormatVariadic.cpp +++ b/llvm/lib/Support/FormatVariadic.cpp @@ -6,6 +6,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/FormatVariadic.h" +#include using namespace llvm; diff --git a/llvm/lib/Support/IntEqClasses.cpp b/llvm/lib/Support/IntEqClasses.cpp index 4a976dcefc65fc..ebb02e6c01e521 100644 --- a/llvm/lib/Support/IntEqClasses.cpp +++ b/llvm/lib/Support/IntEqClasses.cpp @@ -18,6 +18,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/IntEqClasses.h" +#include using namespace 
llvm; diff --git a/llvm/lib/Support/IntervalMap.cpp b/llvm/lib/Support/IntervalMap.cpp index f15c7c9403c36f..674e0f962fa1b2 100644 --- a/llvm/lib/Support/IntervalMap.cpp +++ b/llvm/lib/Support/IntervalMap.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/IntervalMap.h" +#include namespace llvm { namespace IntervalMapImpl { diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp index 8f3f4aa8caeaf2..2f1cff7914bca8 100644 --- a/llvm/lib/Support/KnownBits.cpp +++ b/llvm/lib/Support/KnownBits.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/KnownBits.h" +#include using namespace llvm; diff --git a/llvm/lib/Support/PrettyStackTrace.cpp b/llvm/lib/Support/PrettyStackTrace.cpp index bfb238cc853919..30a12f65966aeb 100644 --- a/llvm/lib/Support/PrettyStackTrace.cpp +++ b/llvm/lib/Support/PrettyStackTrace.cpp @@ -22,6 +22,7 @@ #include "llvm/Support/raw_ostream.h" #include +#include #include #include #include diff --git a/llvm/lib/Support/Regex.cpp b/llvm/lib/Support/Regex.cpp index 615e48a5df7e87..f065adadc62bb0 100644 --- a/llvm/lib/Support/Regex.cpp +++ b/llvm/lib/Support/Regex.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" +#include #include // Important this comes last because it defines "_REGEX_H_". 
At least on diff --git a/llvm/lib/Support/StringPool.cpp b/llvm/lib/Support/StringPool.cpp index 82351017b8ccac..2746444453897d 100644 --- a/llvm/lib/Support/StringPool.cpp +++ b/llvm/lib/Support/StringPool.cpp @@ -12,6 +12,7 @@ #include "llvm/Support/StringPool.h" #include "llvm/ADT/StringRef.h" +#include using namespace llvm; diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp index e09abd24eb5b11..79f31efefb787e 100644 --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -14,6 +14,7 @@ #include "llvm/Support/Host.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/TargetParser.h" +#include #include using namespace llvm; diff --git a/llvm/lib/Support/VersionTuple.cpp b/llvm/lib/Support/VersionTuple.cpp index 60b59424fbb49a..e8265c0d41bb1e 100644 --- a/llvm/lib/Support/VersionTuple.cpp +++ b/llvm/lib/Support/VersionTuple.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/VersionTuple.h" #include "llvm/Support/raw_ostream.h" +#include using namespace llvm; diff --git a/llvm/lib/TableGen/TableGenBackend.cpp b/llvm/lib/TableGen/TableGenBackend.cpp index e11b28e8cff99c..252f126d2d00ff 100644 --- a/llvm/lib/TableGen/TableGenBackend.cpp +++ b/llvm/lib/TableGen/TableGenBackend.cpp @@ -13,6 +13,7 @@ #include "llvm/TableGen/TableGenBackend.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/raw_ostream.h" +#include using namespace llvm; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 472b5e628e19e6..436b26c36d2d9a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -6158,6 +6158,14 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, if (FuncInfo->getLOHRelated().count(&MI)) return outliner::InstrType::Illegal; + // We can only outline these if we will tail call the outlined function, or + // fix up the CFI 
offsets. For the sake of safety, don't outline CFI + // instructions. + // + // FIXME: If the proper fixups are implemented, this should be possible. + if (MI.isCFIInstruction()) + return outliner::InstrType::Illegal; + // Don't allow debug values to impact outlining type. if (MI.isDebugInstr() || MI.isIndirectDebugValue()) return outliner::InstrType::Invisible; diff --git a/llvm/lib/Target/AArch64/AArch64StackOffset.h b/llvm/lib/Target/AArch64/AArch64StackOffset.h index f95b5dc5246e91..6fa1c744f77e23 100644 --- a/llvm/lib/Target/AArch64/AArch64StackOffset.h +++ b/llvm/lib/Target/AArch64/AArch64StackOffset.h @@ -16,6 +16,7 @@ #include "llvm/Support/MachineValueType.h" #include "llvm/Support/TypeSize.h" +#include namespace llvm { diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index e18f63cce109a8..f36468d56a263c 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -206,10 +206,6 @@ namespace { /// T2JumpTables - Keep track of all the Thumb2 jumptable instructions. SmallVector T2JumpTables; - /// HasFarJump - True if any far jump instruction has been emitted during - /// the branch fix up pass. 
- bool HasFarJump; - MachineFunction *MF; MachineConstantPool *MCP; const ARMBaseInstrInfo *TII; @@ -270,7 +266,6 @@ namespace { bool fixupImmediateBr(ImmBranch &Br); bool fixupConditionalBr(ImmBranch &Br); bool fixupUnconditionalBr(ImmBranch &Br); - bool undoLRSpillRestore(); bool optimizeThumb2Instructions(); bool optimizeThumb2Branches(); bool reorderThumb2JumpTables(); @@ -363,7 +358,6 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { isThumb1 = AFI->isThumb1OnlyFunction(); isThumb2 = AFI->isThumb2Function(); - HasFarJump = false; bool GenerateTBB = isThumb2 || (isThumb1 && SynthesizeThumb1TBB); // Renumber all of the machine basic blocks in the function, guaranteeing that @@ -456,11 +450,6 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { // After a while, this might be made debug-only, but it is not expensive. verify(); - // If LR has been forced spilled and no far jump (i.e. BL) has been issued, - // undo the spill / restore of LR if possible. - if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump()) - MadeChange |= undoLRSpillRestore(); - // Save the mapping between original and cloned constpool entries. for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) { for (unsigned j = 0, je = CPEntries[i].size(); j != je; ++j) { @@ -1633,7 +1622,6 @@ ARMConstantIslands::fixupUnconditionalBr(ImmBranch &Br) { BBInfoVector &BBInfo = BBUtils->getBBInfo(); BBInfo[MBB->getNumber()].Size += 2; BBUtils->adjustBBOffsetsAfter(MBB); - HasFarJump = true; ++NumUBrFixed; LLVM_DEBUG(dbgs() << " Changed B to long jump " << *MI); @@ -1735,34 +1723,6 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) { return true; } -/// undoLRSpillRestore - Remove Thumb push / pop instructions that only spills -/// LR / restores LR to pc. FIXME: This is done here because it's only possible -/// to do this if tBfar is not used. 
-bool ARMConstantIslands::undoLRSpillRestore() { - bool MadeChange = false; - for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) { - MachineInstr *MI = PushPopMIs[i]; - // First two operands are predicates. - if (MI->getOpcode() == ARM::tPOP_RET && - MI->getOperand(2).getReg() == ARM::PC && - MI->getNumExplicitOperands() == 3) { - // Create the new insn and copy the predicate from the old. - BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET)) - .add(MI->getOperand(0)) - .add(MI->getOperand(1)); - MI->eraseFromParent(); - MadeChange = true; - } else if (MI->getOpcode() == ARM::tPUSH && - MI->getOperand(2).getReg() == ARM::LR && - MI->getNumExplicitOperands() == 3) { - // Just remove the push. - MI->eraseFromParent(); - MadeChange = true; - } - } - return MadeChange; -} - bool ARMConstantIslands::optimizeThumb2Instructions() { bool MadeChange = false; diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 8f1bd3ce514512..b8434735451c3a 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -1768,8 +1768,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, if (!LRSpilled && AFI->isThumb1OnlyFunction()) { unsigned FnSize = EstimateFunctionSizeInBytes(MF, TII); // Force LR to be spilled if the Thumb function size is > 2048. This enables - // use of BL to implement far jump. If it turns out that it's not needed - // then the branch fix up path will undo it. + // use of BL to implement far jump. 
if (FnSize >= (1 << 11)) { CanEliminateFrame = false; ForceLRSpill = true; @@ -2120,10 +2119,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, } } - if (ForceLRSpill) { + if (ForceLRSpill) SavedRegs.set(ARM::LR); - AFI->setLRIsSpilledForFarJump(true); - } AFI->setLRIsSpilled(SavedRegs.test(ARM::LR)); } diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index bb136e92329ba1..7adf52e1598fdc 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -58,10 +58,6 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// emitPrologue. bool RestoreSPFromFP = false; - /// LRSpilledForFarJump - True if the LR register has been for spilled to - /// enable far jump. - bool LRSpilledForFarJump = false; - /// LRSpilled - True if the LR register has been for spilled for /// any reason, so it's legal to emit an ARM::tBfar (i.e. "bl"). bool LRSpilled = false; @@ -162,9 +158,6 @@ class ARMFunctionInfo : public MachineFunctionInfo { bool isLRSpilled() const { return LRSpilled; } void setLRIsSpilled(bool s) { LRSpilled = s; } - bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; } - void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; } - unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; } void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; } diff --git a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp index f466c5c053ad54..1c6a5046456e66 100644 --- a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp @@ -597,8 +597,8 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { // Load low byte. auto MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(CurDstLoReg, RegState::Define) - .addReg(SrcReg, RegState::Define); + .addReg(CurDstLoReg, RegState::Define) + .addReg(SrcReg); // Push low byte onto stack if necessary. 
if (TmpReg) diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index 880688807702d0..9d14eb9b212a05 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -259,6 +259,8 @@ const char *AVRTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE(ASR); NODE(LSLLOOP); NODE(LSRLOOP); + NODE(ROLLOOP); + NODE(RORLOOP); NODE(ASRLOOP); NODE(BRCOND); NODE(CMP); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 4a53cc489184d4..5ab000df2db08a 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -1565,14 +1565,7 @@ HexagonTargetLowering::LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const { if (BitWidth == 64) return Combines[0]; - // It must be i128. I128 is not a legal type, so this part will be - // executed during type legalization. We need to generate code that - // the default expansion can break up into smaller pieces. 
- SDValue C0 = DAG.getZExtOrTrunc(Combines[0], dl, ResTy); - SDValue C1 = DAG.getNode(ISD::SHL, dl, ResTy, - DAG.getZExtOrTrunc(Combines[1], dl, ResTy), - DAG.getConstant(64, dl, MVT::i32)); - return DAG.getNode(ISD::OR, dl, ResTy, C0, C1); + return DAG.getNode(ISD::BUILD_PAIR, dl, ResTy, Combines); } return Op; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index 8c7b8a81889efd..af1451cc470453 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -12,6 +12,7 @@ #include "PPCMCAsmInfo.h" #include "llvm/ADT/Triple.h" +#include using namespace llvm; diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 9eeccc25e1e6a6..0f43ccf630974a 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -315,9 +315,10 @@ void SystemZFrameLowering:: processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo &MFFrame = MF.getFrameInfo(); + bool BackChain = MF.getFunction().hasFnAttribute("backchain"); - if (!usePackedStack(MF)) - // Always create the full incoming register save area. + if (!usePackedStack(MF) || BackChain) + // Create the incoming register save area. getOrCreateFramePointerSaveIndex(MF); // Get the size of our stack frame to be allocated ... diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 51b97d3b6f6a09..739377ed0f952e 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -442,9 +442,9 @@ class SystemZTargetLowering : public TargetLowering { bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override { - // Using overflow ops for overflow checks only should beneficial on - // SystemZ. 
- return TargetLowering::shouldFormOverflowOp(Opcode, VT, true); + // Form add and sub with overflow intrinsics regardless of any extra + // users of the math result. + return VT == MVT::i32 || VT == MVT::i64; } const char *getTargetNodeName(unsigned Opcode) const override; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 067748fdb1f873..a97f8e95769d0a 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -373,6 +373,27 @@ bool X86AsmBackend::needAlign(MCObjectStreamer &OS) const { return true; } +/// X86 has certain instructions which enable interrupts exactly one +/// instruction *after* the instruction which stores to SS. Return true if the +/// given instruction has such an interrupt delay slot. +static bool hasInterruptDelaySlot(const MCInst &Inst) { + switch (Inst.getOpcode()) { + case X86::POPSS16: + case X86::POPSS32: + case X86::STI: + return true; + + case X86::MOV16sr: + case X86::MOV32sr: + case X86::MOV64sr: + case X86::MOV16sm: + if (Inst.getOperand(0).getReg() == X86::SS) + return true; + break; + } + return false; +} + /// Check if the instruction operand needs to be aligned. Padding is disabled /// before intruction which may be rewritten by linker(e.g. TLSCALL). bool X86AsmBackend::needAlignInst(const MCInst &Inst) const { @@ -401,7 +422,10 @@ void X86AsmBackend::alignBranchesBegin(MCObjectStreamer &OS, MCFragment *CF = OS.getCurrentFragment(); bool NeedAlignFused = AlignBranchType & X86::AlignBranchFused; - if (NeedAlignFused && isMacroFused(PrevInst, Inst) && CF) { + if (hasInterruptDelaySlot(PrevInst)) { + // If this instruction follows an interrupt enabling instruction with a one + // instruction delay, inserting a nop would change behavior. 
+ } else if (NeedAlignFused && isMacroFused(PrevInst, Inst) && CF) { // Macro fusion actually happens and there is no other fragment inserted // after the previous instruction. NOP can be emitted in PF to align fused // jcc. @@ -442,7 +466,7 @@ void X86AsmBackend::alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) { if (!needAlign(OS)) return; // If the branch is emitted into a MCRelaxableFragment, we can determine the - // size of the branch easily in during the process of layout. When the + // size of the branch easily in MCAssembler::relaxBoundaryAlign. When the // branch is fused, the fused branch(macro fusion pair) must be emitted into // two fragments. Or when the branch is unfused, the branch must be emitted // into one fragment. The MCRelaxableFragment naturally marks the end of the diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index ed0c050a2b9759..f705f59b4d8f83 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1826,7 +1826,7 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, // There is nothing we can do here unless the mask is removing some bits. // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. - if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; + if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; // We also need to ensure that mask is a continuous run of bits. if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; @@ -1921,7 +1921,7 @@ static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, // There is nothing we can do here unless the mask is removing some bits. // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. 
- if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; + if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; MVT VT = N.getSimpleValueType(); SDLoc DL(N); @@ -4845,23 +4845,23 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); - unsigned Opc, MOpc; + unsigned ROpc, MOpc; bool isSigned = Opcode == ISD::SDIVREM; if (!isSigned) { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); - case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break; - case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break; - case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break; - case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break; + case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break; + case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break; + case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break; + case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break; } } else { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); - case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break; - case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break; - case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break; - case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break; + case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break; + case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break; + case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break; + case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break; } } @@ -4970,7 +4970,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { CurDAG->setNodeMemRefs(CNode, {cast(N1)->getMemOperand()}); } else { InFlag = - SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0); + SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InFlag), 0); } // Prevent use of AH in a REX instruction by explicitly copying it to diff --git a/llvm/lib/TextAPI/MachO/TextStub.cpp 
b/llvm/lib/TextAPI/MachO/TextStub.cpp index cdfe7f47ee61a3..5637639b8ef83e 100644 --- a/llvm/lib/TextAPI/MachO/TextStub.cpp +++ b/llvm/lib/TextAPI/MachO/TextStub.cpp @@ -959,7 +959,8 @@ template <> struct MappingTraits { for (auto &sym : CurrentSection.WeakSymbols) File->addSymbol(SymbolKind::GlobalSymbol, sym, - CurrentSection.Targets); + CurrentSection.Targets, SymbolFlags::WeakDefined); + for (auto &sym : CurrentSection.TlvSymbols) File->addSymbol(SymbolKind::GlobalSymbol, sym, CurrentSection.Targets, diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 7c26f156d4c90b..6f38a3123932cd 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -735,9 +735,6 @@ static bool isKnownTypeIdMember(Metadata *TypeId, const DataLayout &DL, /// replace the call with. Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI, const TypeIdLowering &TIL) { - // Delay lowering if the resolution is currently unknown. 
- if (TIL.TheKind == TypeTestResolution::Unknown) - return nullptr; if (TIL.TheKind == TypeTestResolution::Unsat) return ConstantInt::getFalse(M.getContext()); @@ -1046,10 +1043,8 @@ void LowerTypeTestsModule::importTypeTest(CallInst *CI) { TypeIdLowering TIL = importTypeId(TypeIdStr->getString()); Value *Lowered = lowerTypeTestCall(TypeIdStr, CI, TIL); - if (Lowered) { - CI->replaceAllUsesWith(Lowered); - CI->eraseFromParent(); - } + CI->replaceAllUsesWith(Lowered); + CI->eraseFromParent(); } // ThinLTO backend: the function F has a jump table entry; update this module @@ -1172,10 +1167,8 @@ void LowerTypeTestsModule::lowerTypeTestCalls( for (CallInst *CI : TIUI.CallSites) { ++NumTypeTestCallsLowered; Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL); - if (Lowered) { - CI->replaceAllUsesWith(Lowered); - CI->eraseFromParent(); - } + CI->replaceAllUsesWith(Lowered); + CI->eraseFromParent(); } } } diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 5b9cf7296df219..d321aa055a19d5 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -504,7 +504,6 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createBarrierNoopPass()); if (PerformThinLTO) { - MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true)); // Drop available_externally and unreferenced globals. This is necessary // with ThinLTO in order to avoid leaving undefined references to dead // globals in the object file. @@ -538,11 +537,9 @@ void PassManagerBuilder::populateModulePassManager( // inter-module indirect calls. For that we perform indirect call promotion // earlier in the pass pipeline, here before globalopt. Otherwise imported // available_externally functions look unreferenced and are removed. 
- if (PerformThinLTO) { + if (PerformThinLTO) MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true, !PGOSampleUse.empty())); - MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true)); - } // For SamplePGO in ThinLTO compile phase, we do not want to unroll loops // as it will change the CFG too much to make the 2nd profile annotation @@ -1063,8 +1060,8 @@ void PassManagerBuilder::populateThinLTOPassManager( PM.add(createVerifierPass()); if (ImportSummary) { - // This pass imports type identifier resolutions for whole-program - // devirtualization and CFI. It must run early because other passes may + // These passes import type identifier resolutions for whole-program + // devirtualization and CFI. They must run early because other passes may // disturb the specific instruction patterns that these passes look for, // creating dependencies on resolutions that may not appear in the summary. // @@ -1112,9 +1109,6 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) { // control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at // link time if CFI is enabled. The pass does nothing if CFI is disabled. PM.add(createLowerTypeTestsPass(ExportSummary, nullptr)); - // Run a second time to clean up any type tests left behind by WPD for use - // in ICP (which is performed earlier than this in the regular LTO pipeline). 
- PM.add(createLowerTypeTestsPass(nullptr, nullptr, true)); if (OptLevel != 0) addLateLTOOptimizationPasses(PM); diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 26beb54c205c50..bbc1433a22e895 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -510,9 +510,7 @@ struct DevirtModule { bool areRemarksEnabled(); - void - scanTypeTestUsers(Function *TypeTestFunc, - DenseMap> &TypeIdMap); + void scanTypeTestUsers(Function *TypeTestFunc); void scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc); void buildTypeIdentifierMap( @@ -1668,9 +1666,7 @@ bool DevirtModule::areRemarksEnabled() { return false; } -void DevirtModule::scanTypeTestUsers( - Function *TypeTestFunc, - DenseMap> &TypeIdMap) { +void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc) { // Find all virtual calls via a virtual table pointer %p under an assumption // of the form llvm.assume(llvm.type.test(%p, %md)). This indicates that %p // points to a member of the type identifier %md. Group calls by (type ID, @@ -1690,10 +1686,10 @@ void DevirtModule::scanTypeTestUsers( auto &DT = LookupDomTree(*CI->getFunction()); findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT); - Metadata *TypeId = - cast(CI->getArgOperand(1))->getMetadata(); // If we found any, add them to CallSlots. if (!Assumes.empty()) { + Metadata *TypeId = + cast(CI->getArgOperand(1))->getMetadata(); Value *Ptr = CI->getArgOperand(0)->stripPointerCasts(); for (DevirtCallSite Call : DevirtCalls) { // Only add this CallSite if we haven't seen it before. The vtable @@ -1706,13 +1702,6 @@ void DevirtModule::scanTypeTestUsers( } } - // If we have any uses on type metadata, keep the type test assumes for - // later analysis. Otherwise remove as they aren't useful, and - // LowerTypeTests will think they are Unsat and lower to False, which - // breaks any uses on assumes. 
- if (TypeIdMap.count(TypeId)) - continue; - // We no longer need the assumes or the type test. for (auto Assume : Assumes) Assume->eraseFromParent(); @@ -1911,13 +1900,8 @@ bool DevirtModule::run() { (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty())) return false; - // Rebuild type metadata into a map for easy lookup. - std::vector Bits; - DenseMap> TypeIdMap; - buildTypeIdentifierMap(Bits, TypeIdMap); - if (TypeTestFunc && AssumeFunc) - scanTypeTestUsers(TypeTestFunc, TypeIdMap); + scanTypeTestUsers(TypeTestFunc); if (TypeCheckedLoadFunc) scanTypeCheckedLoadUsers(TypeCheckedLoadFunc); @@ -1939,6 +1923,10 @@ bool DevirtModule::run() { return true; } + // Rebuild type metadata into a map for easy lookup. + std::vector Bits; + DenseMap> TypeIdMap; + buildTypeIdentifierMap(Bits, TypeIdMap); if (TypeIdMap.empty()) return true; @@ -1995,18 +1983,14 @@ bool DevirtModule::run() { // function implementation at offset S.first.ByteOffset, and add to // TargetsForSlot. std::vector TargetsForSlot; - WholeProgramDevirtResolution *Res = nullptr; - if (ExportSummary && isa(S.first.TypeID) && - TypeIdMap.count(S.first.TypeID)) - // For any type id used on a global's type metadata, create the type id - // summary resolution regardless of whether we can devirtualize, so that - // lower type tests knows the type id is not Unsat. 
- Res = &ExportSummary - ->getOrInsertTypeIdSummary( - cast(S.first.TypeID)->getString()) - .WPDRes[S.first.ByteOffset]; if (tryFindVirtualCallTargets(TargetsForSlot, TypeIdMap[S.first.TypeID], S.first.ByteOffset)) { + WholeProgramDevirtResolution *Res = nullptr; + if (ExportSummary && isa(S.first.TypeID)) + Res = &ExportSummary + ->getOrInsertTypeIdSummary( + cast(S.first.TypeID)->getString()) + .WPDRes[S.first.ByteOffset]; if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) { DidVirtualConstProp |= @@ -2120,14 +2104,11 @@ void DevirtIndex::run() { std::vector TargetsForSlot; auto TidSummary = ExportSummary.getTypeIdCompatibleVtableSummary(S.first.TypeID); assert(TidSummary); - // Create the type id summary resolution regardlness of whether we can - // devirtualize, so that lower type tests knows the type id is used on - // a global and not Unsat. - WholeProgramDevirtResolution *Res = - &ExportSummary.getOrInsertTypeIdSummary(S.first.TypeID) - .WPDRes[S.first.ByteOffset]; if (tryFindVirtualCallTargets(TargetsForSlot, *TidSummary, S.first.ByteOffset)) { + WholeProgramDevirtResolution *Res = + &ExportSummary.getOrInsertTypeIdSummary(S.first.TypeID) + .WPDRes[S.first.ByteOffset]; if (!trySingleImplDevirt(TargetsForSlot, S.first, S.second, Res, DevirtTargets)) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index d429fc24dc3343..ee436a4b319374 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -988,12 +988,12 @@ namespace { bool isHoistableAndSinkableInst(Instruction &I) { // Only these instructions are hoistable/sinkable. 
return (isa(I) || isa(I) || isa(I) || - isa(I) || isa(I) || - isa(I) || isa(I) || - isa(I) || isa(I) || isa(I) || + isa(I) || isa(I) || isa(I) || + isa(I) || isa(I) || + isa(I) || isa(I) || isa(I) || isa(I) || isa(I) || isa(I) || - isa(I)); + isa(I) || isa(I)); } /// Return true if all of the alias sets within this AST are known not to /// contain a Mod, or if MSSA knows thare are no MemoryDefs in the loop. diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 6f060800f760d0..ee4d973b2326e1 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -3530,9 +3530,6 @@ static bool mayUsePostIncMode(const TargetTransformInfo &TTI, const SCEV *LoopStep = AR->getStepRecurrence(SE); if (!isa(LoopStep)) return false; - if (LU.AccessTy.getType()->getScalarSizeInBits() != - LoopStep->getType()->getScalarSizeInBits()) - return false; // Check if a post-indexed load/store can be used. 
if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) || TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) { diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 15ab2abbc1a849..a1df49300b9062 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -154,6 +154,10 @@ static cl::opt cl::desc("Allows loops to be peeled when the dynamic " "trip count is known to be low.")); +static cl::opt UnrollAllowLoopNestsPeeling( + "unroll-allow-loop-nests-peeling", cl::init(false), cl::Hidden, + cl::desc("Allows loop nests to be peeled.")); + static cl::opt UnrollUnrollRemainder( "unroll-remainder", cl::Hidden, cl::desc("Allow the loop remainder to be unrolled.")); @@ -215,6 +219,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.Force = false; UP.UpperBound = false; UP.AllowPeeling = true; + UP.AllowLoopNestsPeeling = false; UP.UnrollAndJam = false; UP.PeelProfiledIterations = true; UP.UnrollAndJamInnerLoopThreshold = 60; @@ -255,6 +260,8 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.UpperBound = false; if (UnrollAllowPeeling.getNumOccurrences() > 0) UP.AllowPeeling = UnrollAllowPeeling; + if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0) + UP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling; if (UnrollUnrollRemainder.getNumOccurrences() > 0) UP.UnrollRemainder = UnrollUnrollRemainder; diff --git a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index 6889facbd05040..5a8127e465e8c4 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -903,30 +903,6 @@ bool LoopUnswitch::unswitchIfProfitable(Value *LoopCond, Constant *Val, return true; } -/// Recursively clone the specified loop and all of its children, -/// mapping the blocks with the specified map. 
-static Loop *cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, LoopInfo *LI, - LPPassManager *LPM) { - Loop &New = *LI->AllocateLoop(); - if (PL) - PL->addChildLoop(&New); - else - LI->addTopLevelLoop(&New); - LPM->addLoop(New); - - // Add all of the blocks in L to the new loop. - for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); - I != E; ++I) - if (LI->getLoopFor(*I) == L) - New.addBasicBlockToLoop(cast(VM[*I]), *LI); - - // Add all of the subloops to the new loop. - for (Loop *I : *L) - cloneLoop(I, &New, VM, LI, LPM); - - return &New; -} - /// Emit a conditional branch on two values if LIC == Val, branch to TrueDst, /// otherwise branch to FalseDest. Insert the code immediately before OldBranch /// and remove (but not erase!) it from the function. diff --git a/llvm/lib/Transforms/Utils/KnowledgeRetention.cpp b/llvm/lib/Transforms/Utils/KnowledgeRetention.cpp index f3c9ee42b77f9d..f2f87f9200ed1a 100644 --- a/llvm/lib/Transforms/Utils/KnowledgeRetention.cpp +++ b/llvm/lib/Transforms/Utils/KnowledgeRetention.cpp @@ -171,6 +171,18 @@ CallInst *llvm::BuildAssumeFromInst(const Instruction *I, Module *M) { return Builder.build(); } +static bool BundleHasArguement(const CallBase::BundleOpInfo &BOI, + unsigned Idx) { + return BOI.End - BOI.Begin > Idx; +} + +static Value *getValueFromBundleOpInfo(IntrinsicInst &Assume, + const CallBase::BundleOpInfo &BOI, + unsigned Idx) { + assert(BundleHasArguement(BOI, Idx) && "index out of range"); + return (Assume.op_begin() + BOI.Begin + Idx)->get(); +} + #ifndef NDEBUG static bool isExistingAttribute(StringRef Name) { @@ -219,12 +231,6 @@ bool llvm::hasAttributeInAssume(CallInst &AssumeCI, Value *IsOn, return LHS < BOI.Tag->getKey(); })); - auto getValueFromBundleOpInfo = [&Assume](const CallBase::BundleOpInfo &BOI, - unsigned Idx) { - assert(BOI.End - BOI.Begin > Idx && "index out of range"); - return (Assume.op_begin() + BOI.Begin + Idx)->get(); - }; - if (Lookup == Assume.bundle_op_info_end() || 
Lookup->Tag->getKey() != AttrName) return false; @@ -235,7 +241,7 @@ bool llvm::hasAttributeInAssume(CallInst &AssumeCI, Value *IsOn, if (Lookup == Assume.bundle_op_info_end() || Lookup->Tag->getKey() != AttrName) return false; - if (getValueFromBundleOpInfo(*Lookup, BOIE_WasOn) == IsOn) + if (getValueFromBundleOpInfo(Assume, *Lookup, BOIE_WasOn) == IsOn) break; if (AQR == AssumeQuery::Highest && Lookup == Assume.bundle_op_info_begin()) @@ -247,12 +253,41 @@ bool llvm::hasAttributeInAssume(CallInst &AssumeCI, Value *IsOn, if (Lookup->End - Lookup->Begin < BOIE_Argument) return true; if (ArgVal) - *ArgVal = - cast(getValueFromBundleOpInfo(*Lookup, BOIE_Argument)) - ->getZExtValue(); + *ArgVal = cast( + getValueFromBundleOpInfo(Assume, *Lookup, BOIE_Argument)) + ->getZExtValue(); return true; } +void llvm::fillMapFromAssume(CallInst &AssumeCI, RetainedKnowledgeMap &Result) { + IntrinsicInst &Assume = cast(AssumeCI); + assert(Assume.getIntrinsicID() == Intrinsic::assume && + "this function is intended to be used on llvm.assume"); + for (auto &Bundles : Assume.bundle_op_infos()) { + std::pair Key{ + nullptr, Attribute::getAttrKindFromName(Bundles.Tag->getKey())}; + if (BundleHasArguement(Bundles, BOIE_WasOn)) + Key.first = getValueFromBundleOpInfo(Assume, Bundles, BOIE_WasOn); + + if (Key.first == nullptr && Key.second == Attribute::None) + continue; + if (!BundleHasArguement(Bundles, BOIE_Argument)) { + Result[Key] = {0, 0}; + continue; + } + unsigned Val = cast( + getValueFromBundleOpInfo(Assume, Bundles, BOIE_Argument)) + ->getZExtValue(); + auto Lookup = Result.find(Key); + if (Lookup == Result.end()) { + Result[Key] = {Val, Val}; + continue; + } + Lookup->second.Min = std::min(Val, Lookup->second.Min); + Lookup->second.Max = std::max(Val, Lookup->second.Max); + } +} + PreservedAnalyses AssumeBuilderPass::run(Function &F, FunctionAnalysisManager &AM) { for (Instruction &I : instructions(F)) diff --git a/llvm/lib/Transforms/Utils/Local.cpp 
b/llvm/lib/Transforms/Utils/Local.cpp index 0fb5ac8ebcc321..9cb73230086c2a 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -75,6 +75,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include @@ -1230,24 +1231,6 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, /// Dbg Intrinsic utilities /// -/// See if there is a dbg.value intrinsic for DIVar before I. -static bool LdStHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr, - Instruction *I) { - // Since we can't guarantee that the original dbg.declare instrinsic - // is removed by LowerDbgDeclare(), we need to make sure that we are - // not inserting the same dbg.value intrinsic over and over. - BasicBlock::InstListType::iterator PrevI(I); - if (PrevI != I->getParent()->getInstList().begin()) { - --PrevI; - if (DbgValueInst *DVI = dyn_cast(PrevI)) - if (DVI->getValue() == I->getOperand(0) && - DVI->getVariable() == DIVar && - DVI->getExpression() == DIExpr) - return true; - } - return false; -} - /// See if there is a dbg.value intrinsic for DIVar for the PHI node. static bool PhiHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr, @@ -1324,13 +1307,11 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, // know which part) we insert an dbg.value instrinsic to indicate that we // know nothing about the variable's content. 
DV = UndefValue::get(DV->getType()); - if (!LdStHasDebugValue(DIVar, DIExpr, SI)) - Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI); + Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI); return; } - if (!LdStHasDebugValue(DIVar, DIExpr, SI)) - Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI); + Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI); } /// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value @@ -1341,9 +1322,6 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, auto *DIExpr = DII->getExpression(); assert(DIVar && "Missing variable"); - if (LdStHasDebugValue(DIVar, DIExpr, LI)) - return; - if (!valueCoversEntireFragment(LI->getType(), DII)) { // FIXME: If only referring to a part of the variable described by the // dbg.declare, then we want to insert a dbg.value for the corresponding @@ -1410,6 +1388,7 @@ static bool isStructure(AllocaInst *AI) { /// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into appropriate set /// of llvm.dbg.value intrinsics. bool llvm::LowerDbgDeclare(Function &F) { + bool Changed = false; DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); SmallVector Dbgs; for (auto &FI : F) @@ -1418,7 +1397,7 @@ bool llvm::LowerDbgDeclare(Function &F) { Dbgs.push_back(DDI); if (Dbgs.empty()) - return false; + return Changed; for (auto &I : Dbgs) { DbgDeclareInst *DDI = I; @@ -1471,8 +1450,14 @@ bool llvm::LowerDbgDeclare(Function &F) { } } DDI->eraseFromParent(); + Changed = true; } - return true; + + if (Changed) + for (BasicBlock &BB : F) + RemoveRedundantDbgInstrs(&BB); + + return Changed; } /// Propagate dbg.value intrinsics through the newly inserted PHIs. 
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp b/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp index 7a168ff6f32b0d..afc4bbd7227d1c 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp @@ -289,8 +289,10 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, if (!canPeel(L)) return; - // Only try to peel innermost loops. - if (!L->empty()) + // Only try to peel innermost loops by default. + // The constraint can be relaxed by the target in TTI.getUnrollingPreferences + // or by the flag -unroll-allow-loop-nests-peeling. + if (!UP.AllowLoopNestsPeeling && !L->empty()) return; // If the user provided a peel count, use that. @@ -508,7 +510,10 @@ static void cloneLoopBlocks( BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".peel", F); NewBlocks.push_back(NewBB); - if (ParentLoop) + // If an original block is an immediate child of the loop L, its copy + // is a child of a ParentLoop after peeling. If a block is a child of + // a nested loop, it is handled in the cloneLoop() call below. + if (ParentLoop && LI->getLoopFor(*BB) == L) ParentLoop->addBasicBlockToLoop(NewBB, *LI); VMap[*BB] = NewBB; @@ -525,6 +530,12 @@ static void cloneLoopBlocks( } } + // Recursively create the new Loop objects for nested loops, if any, + // to preserve LoopInfo. + for (Loop *ChildLoop : *L) { + cloneLoop(ChildLoop, ParentLoop, VMap, LI, nullptr); + } + // Hook-up the control flow for the newly inserted blocks. 
// The new header is hooked up directly to the "top", which is either // the original loop preheader (for the first iteration) or the previous diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 69020219d9d41d..b86a67faf64658 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -46,6 +46,11 @@ using namespace llvm; using namespace llvm::PatternMatch; +static cl::opt ForceReductionIntrinsic( + "force-reduction-intrinsics", cl::Hidden, + cl::desc("Force creating reduction intrinsics for testing."), + cl::init(false)); + #define DEBUG_TYPE "loop-utils" static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced"; @@ -1015,7 +1020,8 @@ Value *llvm::createSimpleTargetReduction( llvm_unreachable("Unhandled opcode"); break; } - if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags)) + if (ForceReductionIntrinsic || + TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags)) return BuildFunc(); return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps); } @@ -1499,3 +1505,27 @@ void llvm::appendLoopsToWorklist(LoopInfo &LI, SmallPriorityWorklist &Worklist) { appendReversedLoopsToWorklist(LI, Worklist); } + +Loop *llvm::cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, + LoopInfo *LI, LPPassManager *LPM) { + Loop &New = *LI->AllocateLoop(); + if (PL) + PL->addChildLoop(&New); + else + LI->addTopLevelLoop(&New); + + if (LPM) + LPM->addLoop(New); + + // Add all of the blocks in L to the new loop. + for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); + I != E; ++I) + if (LI->getLoopFor(*I) == L) + New.addBasicBlockToLoop(cast(VM[*I]), *LI); + + // Add all of the subloops to the new loop. 
+ for (Loop *I : *L) + cloneLoop(I, &New, VM, LI, LPM); + + return &New; +} diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d3da26ece05125..283cc9cf87146d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7193,8 +7193,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // --------------------------------------------------------------------------- // Create a dummy pre-entry VPBasicBlock to start building the VPlan. + auto Plan = std::make_unique(); VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); - auto Plan = std::make_unique(VPBB); + Plan->setEntry(VPBB); // Represent values that will have defs inside VPlan. for (Value *V : NeedDef) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f1c708720ccf4b..02150f8c2fb2f2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -56,6 +56,32 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { return OS; } +// Get the top-most entry block of \p Start. This is the entry block of the +// containing VPlan. 
This function is templated to support both const and non-const blocks +template static T *getPlanEntry(T *Start) { + T *Next = Start; + T *Current = Start; + while ((Next = Next->getParent())) + Current = Next; + + SmallSetVector WorkList; + WorkList.insert(Current); + + for (unsigned i = 0; i < WorkList.size(); i++) { + T *Current = WorkList[i]; + if (Current->getNumPredecessors() == 0) + return Current; + auto &Predecessors = Current->getPredecessors(); + WorkList.insert(Predecessors.begin(), Predecessors.end()); + } + + llvm_unreachable("VPlan without any entry node without predecessors"); +} + +VPlan *VPBlockBase::getPlan() { return getPlanEntry(this)->Plan; } + +const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(this)->Plan; } + /// \return the VPBasicBlock that is the entry of Block, possibly indirectly. const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const { const VPBlockBase *Block = this; @@ -71,6 +97,12 @@ VPBasicBlock *VPBlockBase::getEntryBasicBlock() { return cast(Block); } +void VPBlockBase::setPlan(VPlan *ParentPlan) { + assert(ParentPlan->getEntry() == this && + "Can only set plan on its entry block."); + Plan = ParentPlan; +} + /// \return the VPBasicBlock that is the exit of Block, possibly indirectly. const VPBasicBlock *VPBlockBase::getExitBasicBlock() const { const VPBlockBase *Block = this; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c65abc3639d731..914dfe603c5af7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -365,6 +365,10 @@ class VPBlockBase { /// Current block predicate - null if the block does not need a predicate. VPValue *Predicate = nullptr; + /// VPlan containing the block. Can only be set on the entry block of the + /// plan. + VPlan *Plan = nullptr; + /// Add \p Successor as the last successor to this block. 
void appendSuccessor(VPBlockBase *Successor) { assert(Successor && "Cannot add nullptr successor!"); @@ -418,6 +422,14 @@ class VPBlockBase { VPRegionBlock *getParent() { return Parent; } const VPRegionBlock *getParent() const { return Parent; } + /// \return A pointer to the plan containing the current block. + VPlan *getPlan(); + const VPlan *getPlan() const; + + /// Sets the pointer of the plan containing the block. The block must be the + /// entry block into the VPlan. + void setPlan(VPlan *ParentPlan); + void setParent(VPRegionBlock *P) { Parent = P; } /// \return the VPBasicBlock that is the entry of this VPBlockBase, @@ -1402,7 +1414,11 @@ class VPlan { VPBlockBase *getEntry() { return Entry; } const VPBlockBase *getEntry() const { return Entry; } - VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; } + VPBlockBase *setEntry(VPBlockBase *Block) { + Entry = Block; + Block->setPlan(this); + return Entry; + } /// The backedge taken count of the original loop. VPValue *getOrCreateBackedgeTakenCount() { diff --git a/llvm/test/Analysis/LoopNestAnalysis/imperfectnest.ll b/llvm/test/Analysis/LoopNestAnalysis/imperfectnest.ll new file mode 100644 index 00000000000000..4c8066ec58775e --- /dev/null +++ b/llvm/test/Analysis/LoopNestAnalysis/imperfectnest.ll @@ -0,0 +1,493 @@ +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +; Test an imperfect 2-dim loop nest of the form: +; for (int i = 0; i < nx; ++i) { +; x[i] = i; +; for (int j = 0; j < ny; ++j) +; y[j][i] = x[i] + j; +; } + +define void @imperf_nest_1(i32 signext %nx, i32 signext %ny) { +; CHECK-LABEL: IsPerfect=false, Depth=2, OutermostLoop: imperf_nest_1_loop_i, Loops: ( imperf_nest_1_loop_i imperf_nest_1_loop_j ) +entry: + %0 = zext i32 %ny to i64 + %1 = zext i32 %nx to i64 + %2 = mul nuw i64 %0, %1 + %vla = alloca double, i64 %2, align 8 + %3 = zext i32 %ny to i64 + %vla1 = alloca double, i64 %3, align 8 + br label %imperf_nest_1_loop_i + +imperf_nest_1_loop_i: + %i2.0 = 
phi i32 [ 0, %entry ], [ %inc16, %for.inc15 ] + %cmp = icmp slt i32 %i2.0, %nx + br i1 %cmp, label %for.body, label %for.end17 + +for.body: + %conv = sitofp i32 %i2.0 to double + %idxprom = sext i32 %i2.0 to i64 + %arrayidx = getelementptr inbounds double, double* %vla1, i64 %idxprom + store double %conv, double* %arrayidx, align 8 + br label %imperf_nest_1_loop_j + +imperf_nest_1_loop_j: + %j3.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %cmp5 = icmp slt i32 %j3.0, %ny + br i1 %cmp5, label %for.body7, label %for.end + +for.body7: + %idxprom8 = sext i32 %i2.0 to i64 + %arrayidx9 = getelementptr inbounds double, double* %vla1, i64 %idxprom8 + %4 = load double, double* %arrayidx9, align 8 + %conv10 = sitofp i32 %j3.0 to double + %add = fadd double %4, %conv10 + %idxprom11 = sext i32 %j3.0 to i64 + %5 = mul nsw i64 %idxprom11, %1 + %arrayidx12 = getelementptr inbounds double, double* %vla, i64 %5 + %idxprom13 = sext i32 %i2.0 to i64 + %arrayidx14 = getelementptr inbounds double, double* %arrayidx12, i64 %idxprom13 + store double %add, double* %arrayidx14, align 8 + br label %for.inc + +for.inc: + %inc = add nsw i32 %j3.0, 1 + br label %imperf_nest_1_loop_j + +for.end: + br label %for.inc15 + +for.inc15: + %inc16 = add nsw i32 %i2.0, 1 + br label %imperf_nest_1_loop_i + +for.end17: + ret void +} + +; Test an imperfect 2-dim loop nest of the form: +; for (int i = 0; i < nx; ++i) { +; for (int j = 0; j < ny; ++j) +; y[j][i] = x[i] + j; +; y[0][i] += i; +; } + +define void @imperf_nest_2(i32 signext %nx, i32 signext %ny) { +; CHECK-LABEL: IsPerfect=false, Depth=2, OutermostLoop: imperf_nest_2_loop_i, Loops: ( imperf_nest_2_loop_i imperf_nest_2_loop_j ) +entry: + %0 = zext i32 %ny to i64 + %1 = zext i32 %nx to i64 + %2 = mul nuw i64 %0, %1 + %vla = alloca double, i64 %2, align 8 + %3 = zext i32 %ny to i64 + %vla1 = alloca double, i64 %3, align 8 + br label %imperf_nest_2_loop_i + +imperf_nest_2_loop_i: + %i2.0 = phi i32 [ 0, %entry ], [ %inc17, %for.inc16 ] + %cmp = 
icmp slt i32 %i2.0, %nx + br i1 %cmp, label %for.body, label %for.end18 + +for.body: + br label %imperf_nest_2_loop_j + +imperf_nest_2_loop_j: + %j3.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %cmp5 = icmp slt i32 %j3.0, %ny + br i1 %cmp5, label %for.body6, label %for.end + +for.body6: + %idxprom = sext i32 %i2.0 to i64 + %arrayidx = getelementptr inbounds double, double* %vla1, i64 %idxprom + %4 = load double, double* %arrayidx, align 8 + %conv = sitofp i32 %j3.0 to double + %add = fadd double %4, %conv + %idxprom7 = sext i32 %j3.0 to i64 + %5 = mul nsw i64 %idxprom7, %1 + %arrayidx8 = getelementptr inbounds double, double* %vla, i64 %5 + %idxprom9 = sext i32 %i2.0 to i64 + %arrayidx10 = getelementptr inbounds double, double* %arrayidx8, i64 %idxprom9 + store double %add, double* %arrayidx10, align 8 + br label %for.inc + +for.inc: + %inc = add nsw i32 %j3.0, 1 + br label %imperf_nest_2_loop_j + +for.end: + %conv11 = sitofp i32 %i2.0 to double + %6 = mul nsw i64 0, %1 + %arrayidx12 = getelementptr inbounds double, double* %vla, i64 %6 + %idxprom13 = sext i32 %i2.0 to i64 + %arrayidx14 = getelementptr inbounds double, double* %arrayidx12, i64 %idxprom13 + %7 = load double, double* %arrayidx14, align 8 + %add15 = fadd double %7, %conv11 + store double %add15, double* %arrayidx14, align 8 + br label %for.inc16 + +for.inc16: + %inc17 = add nsw i32 %i2.0, 1 + br label %imperf_nest_2_loop_i + +for.end18: + ret void +} + +; Test an imperfect 2-dim loop nest of the form: +; for (i = 0; i < nx; ++i) { +; for (j = 0; j < ny-nk; ++j) +; y[i][j] = x[i] + j; +; for (j = ny-nk; j < ny; ++j) +; y[i][j] = x[i] - j; +; } + +define void @imperf_nest_3(i32 signext %nx, i32 signext %ny, i32 signext %nk) { +; CHECK-LABEL: IsPerfect=false, Depth=2, OutermostLoop: imperf_nest_3_loop_i, Loops: ( imperf_nest_3_loop_i imperf_nest_3_loop_j imperf_nest_3_loop_k ) +entry: + %0 = zext i32 %nx to i64 + %1 = zext i32 %ny to i64 + %2 = mul nuw i64 %0, %1 + %vla = alloca double, i64 %2, 
align 8 + %3 = zext i32 %ny to i64 + %vla1 = alloca double, i64 %3, align 8 + br label %imperf_nest_3_loop_i + +imperf_nest_3_loop_i: ; preds = %for.inc25, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc26, %for.inc25 ] + %cmp = icmp slt i32 %i.0, %nx + br i1 %cmp, label %for.body, label %for.end27 + +for.body: ; preds = %for.cond + br label %imperf_nest_3_loop_j + +imperf_nest_3_loop_j: ; preds = %for.inc, %for.body + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %sub = sub nsw i32 %ny, %nk + %cmp3 = icmp slt i32 %j.0, %sub + br i1 %cmp3, label %for.body4, label %for.end + +for.body4: ; preds = %imperf_nest_3_loop_j + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds double, double* %vla1, i64 %idxprom + %4 = load double, double* %arrayidx, align 8 + %conv = sitofp i32 %j.0 to double + %add = fadd double %4, %conv + %idxprom5 = sext i32 %i.0 to i64 + %5 = mul nsw i64 %idxprom5, %1 + %arrayidx6 = getelementptr inbounds double, double* %vla, i64 %5 + %idxprom7 = sext i32 %j.0 to i64 + %arrayidx8 = getelementptr inbounds double, double* %arrayidx6, i64 %idxprom7 + store double %add, double* %arrayidx8, align 8 + br label %for.inc + +for.inc: ; preds = %for.body4 + %inc = add nsw i32 %j.0, 1 + br label %imperf_nest_3_loop_j + +for.end: ; preds = %imperf_nest_3_loop_j + %sub9 = sub nsw i32 %ny, %nk + br label %imperf_nest_3_loop_k + +imperf_nest_3_loop_k: ; preds = %for.inc22, %for.end + %j.1 = phi i32 [ %sub9, %for.end ], [ %inc23, %for.inc22 ] + %cmp11 = icmp slt i32 %j.1, %ny + br i1 %cmp11, label %for.body13, label %for.end24 + +for.body13: ; preds = %imperf_nest_3_loop_k + %idxprom14 = sext i32 %i.0 to i64 + %arrayidx15 = getelementptr inbounds double, double* %vla1, i64 %idxprom14 + %6 = load double, double* %arrayidx15, align 8 + %conv16 = sitofp i32 %j.1 to double + %sub17 = fsub double %6, %conv16 + %idxprom18 = sext i32 %i.0 to i64 + %7 = mul nsw i64 %idxprom18, %1 + %arrayidx19 = getelementptr inbounds double, double* %vla, i64 %7 + 
%idxprom20 = sext i32 %j.1 to i64 + %arrayidx21 = getelementptr inbounds double, double* %arrayidx19, i64 %idxprom20 + store double %sub17, double* %arrayidx21, align 8 + br label %for.inc22 + +for.inc22: ; preds = %for.body13 + %inc23 = add nsw i32 %j.1, 1 + br label %imperf_nest_3_loop_k + +for.end24: ; preds = %imperf_nest_3_loop_k + br label %for.inc25 + +for.inc25: ; preds = %for.end24 + %inc26 = add nsw i32 %i.0, 1 + br label %imperf_nest_3_loop_i + +for.end27: ; preds = %for.cond + ret void +} + +; Test an imperfect loop nest of the form: +; for (i = 0; i < nx; ++i) { +; for (j = 0; j < ny-nk; ++j) +; for (k = 0; k < nk; ++k) +; y[i][j][k] = x[i+j] + k; +; for (j = ny-nk; j < ny; ++j) +; y[i][j][0] = x[i] - j; +; } + +define void @imperf_nest_4(i32 signext %nx, i32 signext %ny, i32 signext %nk) { +; CHECK-LABEL: IsPerfect=false, Depth=2, OutermostLoop: imperf_nest_4_loop_j, Loops: ( imperf_nest_4_loop_j imperf_nest_4_loop_k ) +; CHECK-LABEL: IsPerfect=false, Depth=3, OutermostLoop: imperf_nest_4_loop_i, Loops: ( imperf_nest_4_loop_i imperf_nest_4_loop_j imperf_nest_4_loop_j2 imperf_nest_4_loop_k ) +entry: + %0 = zext i32 %nx to i64 + %1 = zext i32 %ny to i64 + %2 = zext i32 %nk to i64 + %3 = mul nuw i64 %0, %1 + %4 = mul nuw i64 %3, %2 + %vla = alloca double, i64 %4, align 8 + %5 = zext i32 %ny to i64 + %vla1 = alloca double, i64 %5, align 8 + %cmp5 = icmp slt i32 0, %nx + br i1 %cmp5, label %imperf_nest_4_loop_i.lr.ph, label %for.end37 + +imperf_nest_4_loop_i.lr.ph: + br label %imperf_nest_4_loop_i + +imperf_nest_4_loop_i: + %i.0 = phi i32 [ 0, %imperf_nest_4_loop_i.lr.ph ], [ %inc36, %for.inc35 ] + %sub2 = sub nsw i32 %ny, %nk + %cmp33 = icmp slt i32 0, %sub2 + br i1 %cmp33, label %imperf_nest_4_loop_j.lr.ph, label %for.end17 + +imperf_nest_4_loop_j.lr.ph: + br label %imperf_nest_4_loop_j + +imperf_nest_4_loop_j: + %j.0 = phi i32 [ 0, %imperf_nest_4_loop_j.lr.ph ], [ %inc16, %for.inc15 ] + %cmp61 = icmp slt i32 0, %nk + br i1 %cmp61, label 
%imperf_nest_4_loop_k.lr.ph, label %for.end + +imperf_nest_4_loop_k.lr.ph: + br label %imperf_nest_4_loop_k + +imperf_nest_4_loop_k: + %k.0 = phi i32 [ 0, %imperf_nest_4_loop_k.lr.ph ], [ %inc, %for.inc ] + %add = add nsw i32 %i.0, %j.0 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds double, double* %vla1, i64 %idxprom + %6 = load double, double* %arrayidx, align 8 + %conv = sitofp i32 %k.0 to double + %add8 = fadd double %6, %conv + %idxprom9 = sext i32 %i.0 to i64 + %7 = mul nuw i64 %1, %2 + %8 = mul nsw i64 %idxprom9, %7 + %arrayidx10 = getelementptr inbounds double, double* %vla, i64 %8 + %idxprom11 = sext i32 %j.0 to i64 + %9 = mul nsw i64 %idxprom11, %2 + %arrayidx12 = getelementptr inbounds double, double* %arrayidx10, i64 %9 + %idxprom13 = sext i32 %k.0 to i64 + %arrayidx14 = getelementptr inbounds double, double* %arrayidx12, i64 %idxprom13 + store double %add8, double* %arrayidx14, align 8 + br label %for.inc + +for.inc: + %inc = add nsw i32 %k.0, 1 + %cmp6 = icmp slt i32 %inc, %nk + br i1 %cmp6, label %imperf_nest_4_loop_k, label %for.cond5.for.end_crit_edge + +for.cond5.for.end_crit_edge: + br label %for.end + +for.end: + br label %for.inc15 + +for.inc15: + %inc16 = add nsw i32 %j.0, 1 + %sub = sub nsw i32 %ny, %nk + %cmp3 = icmp slt i32 %inc16, %sub + br i1 %cmp3, label %imperf_nest_4_loop_j, label %for.cond2.for.end17_crit_edge + +for.cond2.for.end17_crit_edge: + br label %for.end17 + +for.end17: + %sub18 = sub nsw i32 %ny, %nk + %cmp204 = icmp slt i32 %sub18, %ny + br i1 %cmp204, label %imperf_nest_4_loop_j2.lr.ph, label %for.end34 + +imperf_nest_4_loop_j2.lr.ph: + br label %imperf_nest_4_loop_j2 + +imperf_nest_4_loop_j2: + %j.1 = phi i32 [ %sub18, %imperf_nest_4_loop_j2.lr.ph ], [ %inc33, %for.inc32 ] + %idxprom23 = sext i32 %i.0 to i64 + %arrayidx24 = getelementptr inbounds double, double* %vla1, i64 %idxprom23 + %10 = load double, double* %arrayidx24, align 8 + %conv25 = sitofp i32 %j.1 to double + %sub26 = fsub double %10, 
%conv25 + %idxprom27 = sext i32 %i.0 to i64 + %idxprom29 = sext i32 %j.1 to i64 + %11 = mul nsw i64 %idxprom29, %2 + %12 = mul nuw i64 %1, %2 + %13 = mul nsw i64 %idxprom27, %12 + %arrayidx28 = getelementptr inbounds double, double* %vla, i64 %13 + %arrayidx30 = getelementptr inbounds double, double* %arrayidx28, i64 %11 + %arrayidx31 = getelementptr inbounds double, double* %arrayidx30, i64 0 + store double %sub26, double* %arrayidx31, align 8 + br label %for.inc32 + +for.inc32: + %inc33 = add nsw i32 %j.1, 1 + %cmp20 = icmp slt i32 %inc33, %ny + br i1 %cmp20, label %imperf_nest_4_loop_j2, label %for.cond19.for.end34_crit_edge + +for.cond19.for.end34_crit_edge: + br label %for.end34 + +for.end34: + br label %for.inc35 + +for.inc35: + %inc36 = add nsw i32 %i.0, 1 + %cmp = icmp slt i32 %inc36, %nx + br i1 %cmp, label %imperf_nest_4_loop_i, label %for.cond.for.end37_crit_edge + +for.cond.for.end37_crit_edge: + br label %for.end37 + +for.end37: + ret void +} + +; Test an imperfect loop nest of the form: +; for (int i = 0; i < nx; ++i) +; if (i > 5) { +; for (int j = 0; j < ny; ++j) +; y[j][i] = x[i][j] + j; +; } + +define void @imperf_nest_5(i32** %y, i32** %x, i32 signext %nx, i32 signext %ny) { +; CHECK-LABEL: IsPerfect=false, Depth=2, OutermostLoop: imperf_nest_5_loop_i, Loops: ( imperf_nest_5_loop_i imperf_nest_5_loop_j ) +entry: + %cmp2 = icmp slt i32 0, %nx + br i1 %cmp2, label %imperf_nest_5_loop_i.lr.ph, label %for.end13 + +imperf_nest_5_loop_i.lr.ph: + br label %imperf_nest_5_loop_i + +imperf_nest_5_loop_i: + %i.0 = phi i32 [ 0, %imperf_nest_5_loop_i.lr.ph ], [ %inc12, %for.inc11 ] + %cmp1 = icmp sgt i32 %i.0, 5 + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %cmp31 = icmp slt i32 0, %ny + br i1 %cmp31, label %imperf_nest_5_loop_j.lr.ph, label %for.end + +imperf_nest_5_loop_j.lr.ph: + br label %imperf_nest_5_loop_j + +imperf_nest_5_loop_j: + %j.0 = phi i32 [ 0, %imperf_nest_5_loop_j.lr.ph ], [ %inc, %for.inc ] + %idxprom = sext i32 %i.0 to i64 + 
%arrayidx = getelementptr inbounds i32*, i32** %x, i64 %idxprom + %0 = load i32*, i32** %arrayidx, align 8 + %idxprom5 = sext i32 %j.0 to i64 + %arrayidx6 = getelementptr inbounds i32, i32* %0, i64 %idxprom5 + %1 = load i32, i32* %arrayidx6, align 4 + %add = add nsw i32 %1, %j.0 + %idxprom7 = sext i32 %j.0 to i64 + %arrayidx8 = getelementptr inbounds i32*, i32** %y, i64 %idxprom7 + %2 = load i32*, i32** %arrayidx8, align 8 + %idxprom9 = sext i32 %i.0 to i64 + %arrayidx10 = getelementptr inbounds i32, i32* %2, i64 %idxprom9 + store i32 %add, i32* %arrayidx10, align 4 + br label %for.inc + +for.inc: + %inc = add nsw i32 %j.0, 1 + %cmp3 = icmp slt i32 %inc, %ny + br i1 %cmp3, label %imperf_nest_5_loop_j, label %for.cond2.for.end_crit_edge + +for.cond2.for.end_crit_edge: + br label %for.end + +for.end: + br label %if.end + +if.end: + br label %for.inc11 + +for.inc11: + %inc12 = add nsw i32 %i.0, 1 + %cmp = icmp slt i32 %inc12, %nx + br i1 %cmp, label %imperf_nest_5_loop_i, label %for.cond.for.end13_crit_edge + +for.cond.for.end13_crit_edge: + br label %for.end13 + +for.end13: + ret void +} + +; Test an imperfect loop nest of the form: +; for (int i = 0; i < nx; ++i) +; if (i > 5) { // user branch +; for (int j = 1; j <= 5; j+=2) +; y[j][i] = x[i][j] + j; +; } + +define void @imperf_nest_6(i32** %y, i32** %x, i32 signext %nx, i32 signext %ny) { +; CHECK-LABEL: IsPerfect=false, Depth=2, OutermostLoop: imperf_nest_6_loop_i, Loops: ( imperf_nest_6_loop_i imperf_nest_6_loop_j ) +entry: + %cmp2 = icmp slt i32 0, %nx + br i1 %cmp2, label %imperf_nest_6_loop_i.lr.ph, label %for.end13 + +imperf_nest_6_loop_i.lr.ph: + br label %imperf_nest_6_loop_i + +imperf_nest_6_loop_i: + %i.0 = phi i32 [ 0, %imperf_nest_6_loop_i.lr.ph ], [ %inc12, %for.inc11 ] + %cmp1 = icmp sgt i32 %i.0, 5 + br i1 %cmp1, label %imperf_nest_6_loop_j.lr.ph, label %if.end + +imperf_nest_6_loop_j.lr.ph: + br label %imperf_nest_6_loop_j + +imperf_nest_6_loop_j: + %j.0 = phi i32 [ 1, %imperf_nest_6_loop_j.lr.ph 
], [ %inc, %for.inc ] + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds i32*, i32** %x, i64 %idxprom + %0 = load i32*, i32** %arrayidx, align 8 + %idxprom5 = sext i32 %j.0 to i64 + %arrayidx6 = getelementptr inbounds i32, i32* %0, i64 %idxprom5 + %1 = load i32, i32* %arrayidx6, align 4 + %add = add nsw i32 %1, %j.0 + %idxprom7 = sext i32 %j.0 to i64 + %arrayidx8 = getelementptr inbounds i32*, i32** %y, i64 %idxprom7 + %2 = load i32*, i32** %arrayidx8, align 8 + %idxprom9 = sext i32 %i.0 to i64 + %arrayidx10 = getelementptr inbounds i32, i32* %2, i64 %idxprom9 + store i32 %add, i32* %arrayidx10, align 4 + br label %for.inc + +for.inc: + %inc = add nsw i32 %j.0, 2 + %cmp3 = icmp sle i32 %inc, 5 + br i1 %cmp3, label %imperf_nest_6_loop_j, label %for.cond2.for.end_crit_edge + +for.cond2.for.end_crit_edge: + br label %for.end + +for.end: + br label %if.end + +if.end: + br label %for.inc11 + +for.inc11: + %inc12 = add nsw i32 %i.0, 1 + %cmp = icmp slt i32 %inc12, %nx + br i1 %cmp, label %imperf_nest_6_loop_i, label %for.cond.for.end13_crit_edge + +for.cond.for.end13_crit_edge: + br label %for.end13 + +for.end13: + ret void +} diff --git a/llvm/test/Analysis/LoopNestAnalysis/infinite.ll b/llvm/test/Analysis/LoopNestAnalysis/infinite.ll new file mode 100644 index 00000000000000..7a6cf21584fff2 --- /dev/null +++ b/llvm/test/Analysis/LoopNestAnalysis/infinite.ll @@ -0,0 +1,35 @@ +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +; Test that the loop nest analysis is able to analyze an infinite loop in a loop nest. 
+define void @test1(i32** %A, i1 %cond) { +; CHECK-LABEL: IsPerfect=true, Depth=1, OutermostLoop: for.inner, Loops: ( for.inner ) +; CHECK-LABEL: IsPerfect=false, Depth=2, OutermostLoop: for.outer, Loops: ( for.outer for.inner ) +; CHECK-LABEL: IsPerfect=true, Depth=1, OutermostLoop: for.infinite, Loops: ( for.infinite ) +entry: + br label %for.outer + +for.outer: + %i = phi i64 [ 0, %entry ], [ %inc_i, %for.outer.latch ] + br i1 %cond, label %for.inner, label %for.infinite + +for.inner: + %j = phi i64 [ 0, %for.outer ], [ %inc_j, %for.inner ] + %arrayidx_i = getelementptr inbounds i32*, i32** %A, i64 %i + %0 = load i32*, i32** %arrayidx_i, align 8 + %arrayidx_j = getelementptr inbounds i32, i32* %0, i64 %j + store i32 0, i32* %arrayidx_j, align 4 + %inc_j = add nsw i64 %j, 1 + %cmp_j = icmp slt i64 %inc_j, 100 + br i1 %cmp_j, label %for.inner, label %for.outer.latch + +for.infinite: + br label %for.infinite + +for.outer.latch: + %inc_i = add nsw i64 %i, 1 + %cmp_i = icmp slt i64 %inc_i, 100 + br i1 %cmp_i, label %for.outer, label %for.end + +for.end: + ret void +} diff --git a/llvm/test/Analysis/LoopNestAnalysis/perfectnest.ll b/llvm/test/Analysis/LoopNestAnalysis/perfectnest.ll new file mode 100644 index 00000000000000..b7b3b7a7c93e0c --- /dev/null +++ b/llvm/test/Analysis/LoopNestAnalysis/perfectnest.ll @@ -0,0 +1,275 @@ +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +; Test a perfect 2-dim loop nest of the form: +; for(i=0; i +; CHECK: diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index 9433fd1a31b0a9..d1bc970f3643a2 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -1,12 +1,12 @@ llvm_canonicalize_cmake_booleans( BUILD_SHARED_LIBS HAVE_LIBXAR + HAVE_LIBZ HAVE_OCAMLOPT HAVE_OCAML_OUNIT LLVM_ENABLE_DIA_SDK LLVM_ENABLE_FFI LLVM_ENABLE_THREADS - LLVM_ENABLE_ZLIB LLVM_INCLUDE_GO_TESTS LLVM_LIBXML2_ENABLED LLVM_LINK_LLVM_DYLIB diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-cfi.mir 
b/llvm/test/CodeGen/AArch64/machine-outliner-cfi.mir new file mode 100644 index 00000000000000..707785a566a217 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-cfi.mir @@ -0,0 +1,63 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64-apple-unknown -run-pass=machine-outliner -verify-machineinstrs %s -o - | FileCheck %s + +# Outlining CFI instructions is unsafe. It is possible if the call is tail +# called, but otherwise, it requires fixups. Show that we don't include CFI +# instructions in outlined sequences right now. + +--- | + define void @foo() #0 { ret void } + define void @bar() #0 { ret void } + define void @baz() #0 { ret void } + attributes #0 = { noredzone } +... +--- +name: foo +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: foo + ; CHECK: liveins: $lr + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 + ; CHECK: TCRETURNdi @OUTLINED_FUNCTION_0, 0, implicit $sp, implicit-def $w12, implicit-def $w13, implicit-def $w14, implicit-def $w15 + frame-setup CFI_INSTRUCTION def_cfa $w29, 16 + $w12 = ORRWri $wzr, 1 + $w13 = ORRWri $wzr, 2 + $w14 = ORRWri $wzr, 3 + $w15 = ORRWri $wzr, 4 + RET undef $lr +... +--- +name: bar +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: bar + ; CHECK: liveins: $lr + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 + ; CHECK: TCRETURNdi @OUTLINED_FUNCTION_0, 0, implicit $sp, implicit-def $w12, implicit-def $w13, implicit-def $w14, implicit-def $w15 + frame-setup CFI_INSTRUCTION def_cfa $w29, 16 + $w12 = ORRWri $wzr, 1 + $w13 = ORRWri $wzr, 2 + $w14 = ORRWri $wzr, 3 + $w15 = ORRWri $wzr, 4 + RET undef $lr +... 
+--- +name: baz +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: baz + ; CHECK: liveins: $lr + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 + ; CHECK: TCRETURNdi @OUTLINED_FUNCTION_0, 0, implicit $sp, implicit-def $w12, implicit-def $w13, implicit-def $w14, implicit-def $w15 + frame-setup CFI_INSTRUCTION def_cfa $w29, 16 + $w12 = ORRWri $wzr, 1 + $w13 = ORRWri $wzr, 2 + $w14 = ORRWri $wzr, 3 + $w15 = ORRWri $wzr, 4 + RET undef $lr diff --git a/llvm/test/CodeGen/AMDGPU/at-least-one-def-value-assert.mir b/llvm/test/CodeGen/AMDGPU/at-least-one-def-value-assert.mir index eb244190e562e7..4503ed12cb6fb2 100644 --- a/llvm/test/CodeGen/AMDGPU/at-least-one-def-value-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/at-least-one-def-value-assert.mir @@ -5,7 +5,7 @@ # CHECK-NEXT: - basic block: %bb.0 # CHECK-NEXT: - instruction: 48B dead undef %2.sub0:vreg_128 = COPY %0.sub0:vreg_128 # CHECK-NEXT: - operand 1: %0.sub0:vreg_128 -# CHECK-NEXT: - interval: %0 [16r,48r:0) 0@16r L00000002 [16r,32r:0) 0@16r weight:0.000000e+00 +# CHECK-NEXT: - interval: %0 [16r,48r:0) 0@16r L0000000000000002 [16r,32r:0) 0@16r weight:0.000000e+00 # This used to assert with: !SR.empty() && "At least one value should be defined by this mask" diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll index a9a91803b08e96..babb18f08576eb 100644 --- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll +++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll @@ -1,3 +1,10 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py + +; NOTE: The checks for opt are NOT added by the update script. Those +; checks are looking for the absence of specific metadata, which +; cannot be expressed reliably by the generated checks. 
+ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ISA ; RUN: opt --amdgpu-annotate-uniform -S %s | FileCheck %s -check-prefix=UNIFORM ; RUN: opt --amdgpu-annotate-uniform --si-annotate-control-flow -S %s | FileCheck %s -check-prefix=CONTROLFLOW @@ -9,6 +16,56 @@ target triple = "amdgcn-mesa-mesa3d" define amdgpu_ps void @main(i32 %0, float %1) { +; ISA-LABEL: main: +; ISA: ; %bb.0: ; %start +; ISA-NEXT: v_readfirstlane_b32 s0, v0 +; ISA-NEXT: s_mov_b32 m0, s0 +; ISA-NEXT: s_mov_b32 s0, 0 +; ISA-NEXT: v_interp_p1_f32_e32 v0, v1, attr0.x +; ISA-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; ISA-NEXT: s_mov_b64 s[2:3], 0 +; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7 +; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 +; ISA-NEXT: s_branch BB0_3 +; ISA-NEXT: BB0_1: ; %Flow1 +; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; ISA-NEXT: s_or_b64 exec, exec, s[8:9] +; ISA-NEXT: s_add_i32 s0, s0, 1 +; ISA-NEXT: s_mov_b64 s[8:9], 0 +; ISA-NEXT: BB0_2: ; %Flow +; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; ISA-NEXT: s_and_b64 s[10:11], exec, s[6:7] +; ISA-NEXT: s_or_b64 s[2:3], s[10:11], s[2:3] +; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; ISA-NEXT: s_and_b64 s[8:9], s[8:9], exec +; ISA-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; ISA-NEXT: s_andn2_b64 exec, exec, s[2:3] +; ISA-NEXT: s_cbranch_execz BB0_6 +; ISA-NEXT: BB0_3: ; %loop +; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 +; ISA-NEXT: s_or_b64 s[6:7], s[6:7], exec +; ISA-NEXT: s_cmp_lt_u32 s0, 32 +; ISA-NEXT: s_mov_b64 s[8:9], -1 +; ISA-NEXT: s_cbranch_scc0 BB0_2 +; ISA-NEXT: ; %bb.4: ; %endif1 +; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; ISA-NEXT: s_mov_b64 s[6:7], -1 +; ISA-NEXT: s_and_saveexec_b64 s[8:9], vcc +; ISA-NEXT: s_cbranch_execz BB0_1 +; ISA-NEXT: ; %bb.5: ; %endif2 +; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 +; ISA-NEXT: s_branch BB0_1 +; ISA-NEXT: BB0_6: ; %Flow2 +; ISA-NEXT: s_or_b64 exec, exec, s[2:3] +; ISA-NEXT: v_mov_b32_e32 v1, 0 +; 
ISA-NEXT: s_and_saveexec_b64 s[0:1], s[4:5] +; ISA-NEXT: ; %bb.7: ; %if1 +; ISA-NEXT: v_sqrt_f32_e32 v1, v0 +; ISA-NEXT: ; %bb.8: ; %endloop +; ISA-NEXT: s_or_b64 exec, exec, s[0:1] +; ISA-NEXT: exp mrt0 v1, v1, v1, v1 done vm +; ISA-NEXT: s_endpgm start: %v0 = call float @llvm.amdgcn.interp.p1(float %1, i32 0, i32 0, i32 %0) br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 75ad58df43b347..c18a076aad4e6c 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -1,13 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify %s | FileCheck -check-prefix=IR %s -; SI-LABEL: {{^}}infinite_loop: -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7 -; SI: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop -; SI: s_waitcnt lgkmcnt(0) -; SI: buffer_store_dword [[REG]] -; SI: s_branch [[LOOP]] define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) { +; SI-LABEL: infinite_loop: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: BB0_1: ; %loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_branch BB0_1 +; IR-LABEL: @infinite_loop( +; IR-NEXT: entry: +; IR-NEXT: br label [[LOOP:%.*]] +; IR: loop: +; IR-NEXT: store volatile i32 999, i32 addrspace(1)* [[OUT:%.*]], align 4 +; IR-NEXT: br label [[LOOP]] +; entry: br label %loop @@ -16,31 +30,36 @@ loop: br label %loop } - -; IR-LABEL: @infinite_loop_ret( -; IR: br i1 %cond, label %loop, label %UnifiedReturnBlock - -; IR: loop: -; IR: store 
volatile i32 999, i32 addrspace(1)* %out, align 4 -; IR: br i1 true, label %loop, label %UnifiedReturnBlock - -; IR: UnifiedReturnBlock: -; IR: ret void - - -; SI-LABEL: {{^}}infinite_loop_ret: -; SI: s_cbranch_execz [[RET:BB[0-9]+_[0-9]+]] - -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7 -; SI: s_and_b64 vcc, exec, -1 -; SI: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop -; SI: s_waitcnt lgkmcnt(0) -; SI: buffer_store_dword [[REG]] -; SI: s_cbranch_vccnz [[LOOP]] - -; SI: [[RET]]: ; %UnifiedReturnBlock -; SI: s_endpgm define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) { +; SI-LABEL: infinite_loop_ret: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_cbranch_execz BB1_3 +; SI-NEXT: ; %bb.1: ; %loop.preheader +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_and_b64 vcc, exec, -1 +; SI-NEXT: BB1_2: ; %loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_cbranch_vccnz BB1_2 +; SI-NEXT: BB1_3: ; %UnifiedReturnBlock +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_ret( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1 +; IR-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: loop: +; IR-NEXT: store volatile i32 999, i32 addrspace(1)* [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[LOOP]], label [[UNIFIEDRETURNBLOCK]] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; entry: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %cond = icmp eq i32 %tmp, 1 @@ -54,44 +73,44 @@ return: ret void } - -; IR-LABEL: @infinite_loops( -; IR: br i1 undef, label %loop1, label %loop2 - -; IR: loop1: -; IR: store volatile i32 999, i32 addrspace(1)* %out, align 4 -; IR: br i1 true, label 
%loop1, label %DummyReturnBlock - -; IR: loop2: -; IR: store volatile i32 888, i32 addrspace(1)* %out, align 4 -; IR: br i1 true, label %loop2, label %DummyReturnBlock - -; IR: DummyReturnBlock: -; IR: ret void - - -; SI-LABEL: {{^}}infinite_loops: - -; SI: v_mov_b32_e32 [[REG1:v[0-9]+]], 0x3e7 -; SI: s_and_b64 vcc, exec, -1 - -; SI: [[LOOP1:BB[0-9]+_[0-9]+]]: ; %loop1 -; SI: s_waitcnt lgkmcnt(0) -; SI: buffer_store_dword [[REG1]] -; SI: s_cbranch_vccnz [[LOOP1]] -; SI: s_branch [[RET:BB[0-9]+_[0-9]+]] - -; SI: v_mov_b32_e32 [[REG2:v[0-9]+]], 0x378 -; SI: s_and_b64 vcc, exec, -1 - -; SI: [[LOOP2:BB[0-9]+_[0-9]+]]: ; %loop2 -; SI: s_waitcnt lgkmcnt(0) -; SI: buffer_store_dword [[REG2]] -; SI: s_cbranch_vccnz [[LOOP2]] - -; SI: [[RET]]: ; %DummyReturnBlock -; SI: s_endpgm define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) { +; SI-LABEL: infinite_loops: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_cbranch_scc0 BB2_3 +; SI-NEXT: ; %bb.1: ; %loop1.preheader +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_and_b64 vcc, exec, -1 +; SI-NEXT: BB2_2: ; %loop1 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_cbranch_vccnz BB2_2 +; SI-NEXT: s_branch BB2_5 +; SI-NEXT: BB2_3: +; SI-NEXT: v_mov_b32_e32 v0, 0x378 +; SI-NEXT: s_and_b64 vcc, exec, -1 +; SI-NEXT: BB2_4: ; %loop2 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_cbranch_vccnz BB2_4 +; SI-NEXT: BB2_5: ; %DummyReturnBlock +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loops( +; IR-NEXT: entry: +; IR-NEXT: br i1 undef, label [[LOOP1:%.*]], label [[LOOP2:%.*]] +; IR: loop1: +; IR-NEXT: store volatile i32 999, i32 addrspace(1)* [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[LOOP1]], label [[DUMMYRETURNBLOCK:%.*]] +; 
IR: loop2: +; IR-NEXT: store volatile i32 888, i32 addrspace(1)* [[OUT]], align 4 +; IR-NEXT: br i1 true, label [[LOOP2]], label [[DUMMYRETURNBLOCK]] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; entry: br i1 undef, label %loop1, label %loop2 @@ -104,55 +123,68 @@ loop2: br label %loop2 } - - -; IR-LABEL: @infinite_loop_nest_ret( -; IR: br i1 %cond1, label %outer_loop, label %UnifiedReturnBlock - -; IR: outer_loop: -; IR: br label %inner_loop - -; IR: inner_loop: -; IR: store volatile i32 999, i32 addrspace(1)* %out, align 4 -; IR: %cond3 = icmp eq i32 %tmp, 3 -; IR: br i1 true, label %TransitionBlock, label %UnifiedReturnBlock - -; IR: TransitionBlock: -; IR: br i1 %cond3, label %inner_loop, label %outer_loop - -; IR: UnifiedReturnBlock: -; IR: ret void - -; SI-LABEL: {{^}}infinite_loop_nest_ret: -; SI: s_cbranch_execz [[RET:BB[0-9]+_[0-9]+]] - -; SI: s_mov_b32 -; SI: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %outer_loop - -; SI: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %inner_loop -; SI: s_waitcnt expcnt(0) -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7 -; SI: s_waitcnt lgkmcnt(0) -; SI: buffer_store_dword [[REG]] - -; SI: s_andn2_b64 exec -; SI: s_cbranch_execnz [[INNER_LOOP]] - -; SI: s_andn2_b64 exec -; SI: s_cbranch_execnz [[OUTER_LOOP]] - -; SI: [[RET]]: ; %UnifiedReturnBlock -; SI: s_endpgm define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) { +; SI-LABEL: infinite_loop_nest_ret: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_cbranch_execz BB3_5 +; SI-NEXT: ; %bb.1: ; %outer_loop.preheader +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: BB3_2: ; %outer_loop +; SI-NEXT: ; =>This Loop Header: Depth=1 +; SI-NEXT: ; Child Loop BB3_3 Depth 2 +; SI-NEXT: s_and_b64 s[8:9], exec, vcc +; SI-NEXT: 
s_or_b64 s[2:3], s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: BB3_3: ; %inner_loop +; SI-NEXT: ; Parent Loop BB3_2 Depth=1 +; SI-NEXT: ; => This Inner Loop Header: Depth=2 +; SI-NEXT: s_and_b64 s[10:11], exec, s[0:1] +; SI-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz BB3_3 +; SI-NEXT: ; %bb.4: ; %Flow +; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SI-NEXT: s_cbranch_execnz BB3_2 +; SI-NEXT: BB3_5: ; %UnifiedReturnBlock +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_nest_ret( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND1:%.*]] = icmp eq i32 [[TMP]], 1 +; IR-NEXT: br i1 [[COND1]], label [[OUTER_LOOP:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: outer_loop: +; IR-NEXT: br label [[INNER_LOOP:%.*]] +; IR: inner_loop: +; IR-NEXT: store volatile i32 999, i32 addrspace(1)* [[OUT:%.*]], align 4 +; IR-NEXT: [[COND3:%.*]] = icmp eq i32 [[TMP]], 3 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK]] +; IR: TransitionBlock: +; IR-NEXT: br i1 [[COND3]], label [[INNER_LOOP]], label [[OUTER_LOOP]] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; entry: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %cond1 = icmp eq i32 %tmp, 1 br i1 %cond1, label %outer_loop, label %return outer_loop: - ; %cond2 = icmp eq i32 %tmp, 2 - ; br i1 %cond2, label %outer_loop, label %inner_loop - br label %inner_loop + ; %cond2 = icmp eq i32 %tmp, 2 + ; br i1 %cond2, label %outer_loop, label %inner_loop + br label %inner_loop inner_loop: ; preds = %LeafBlock, %LeafBlock1 store volatile i32 999, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll 
b/llvm/test/CodeGen/AMDGPU/loop_break.ll index d02d406689a9c0..b9788e8babf533 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -1,56 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s ; RUN: llc -march=amdgcn -verify-machineinstrs -disable-block-placement < %s | FileCheck -check-prefix=GCN %s ; Uses llvm.amdgcn.break -; OPT-LABEL: @break_loop( -; OPT: bb1: -; OPT: icmp slt i32 -; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow - -; OPT: bb4: -; OPT: load volatile -; OPT: icmp slt i32 -; OPT: xor i1 %cmp1 -; OPT: br label %Flow - -; OPT: Flow: -; OPT: call i64 @llvm.amdgcn.if.break.i64( -; OPT: call i1 @llvm.amdgcn.loop.i64(i64 -; OPT: br i1 %{{[0-9]+}}, label %bb9, label %bb1 - -; OPT: bb9: -; OPT: call void @llvm.amdgcn.end.cf.i64(i64 - -; GCN-LABEL: {{^}}break_loop: -; GCN: s_mov_b64 [[ACCUM_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}} - -; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1 -; GCN: s_add_i32 s6, s6, 1 -; GCN: s_or_b64 [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec -; GCN: s_cmp_gt_i32 s6, -1 -; GCN: s_cbranch_scc1 [[FLOW:BB[0-9]+_[0-9]+]] - -; GCN: ; %bb4 -; GCN: buffer_load_dword -; GCN: v_cmp_ge_i32_e32 vcc -; GCN: s_andn2_b64 [[INNER_MASK]], [[INNER_MASK]], exec -; GCN: s_and_b64 [[BROKEN_MASK:s\[[0-9]+:[0-9]+\]]], vcc, exec -; GCN: s_or_b64 [[INNER_MASK]], [[INNER_MASK]], [[BROKEN_MASK]] - -; GCN: [[FLOW]]: ; %Flow -; GCN: ; in Loop: Header=BB0_1 Depth=1 -; GCN: s_and_b64 [[AND_MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INNER_MASK]] -; GCN-NEXT: s_or_b64 [[ACCUM_MASK]], [[AND_MASK]], [[ACCUM_MASK]] -; GCN-NEXT: s_andn2_b64 exec, exec, [[ACCUM_MASK]] -; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]] - -; GCN: ; %bb.4: ; %bb9 -; GCN-NEXT: s_endpgm define amdgpu_kernel void @break_loop(i32 %arg) #0 { 
+; OPT-LABEL: @break_loop( +; OPT-NEXT: bb: +; OPT-NEXT: [[ID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG:%.*]] +; OPT-NEXT: br label [[BB1:%.*]] +; OPT: bb1: +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP2:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[LSR_IV_NEXT:%.*]], [[FLOW]] ] +; OPT-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 1 +; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 +; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] +; OPT: bb4: +; OPT-NEXT: [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4 +; OPT-NEXT: [[CMP1:%.*]] = icmp slt i32 [[MY_TMP]], [[LOAD]] +; OPT-NEXT: [[TMP0:%.*]] = xor i1 [[CMP1]], true +; OPT-NEXT: br label [[FLOW]] +; OPT: Flow: +; OPT-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP0]], [[BB4]] ], [ true, [[BB1]] ] +; OPT-NEXT: [[TMP2]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP1]], i64 [[PHI_BROKEN]]) +; OPT-NEXT: [[TMP3:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP2]]) +; OPT-NEXT: br i1 [[TMP3]], label [[BB9:%.*]], label [[BB1]] +; OPT: bb9: +; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) +; OPT-NEXT: ret void +; +; GCN-LABEL: break_loop: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 +; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-NEXT: ; implicit-def: $sgpr6 +; GCN-NEXT: BB0_1: ; %bb1 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_add_i32 s6, s6, 1 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cmp_gt_i32 s6, -1 +; GCN-NEXT: s_cbranch_scc1 BB0_3 +; GCN-NEXT: ; %bb.2: ; %bb4 +; GCN-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; GCN-NEXT: 
s_andn2_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_and_b64 s[8:9], vcc, exec +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GCN-NEXT: BB0_3: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] +; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN-NEXT: s_cbranch_execnz BB0_1 +; GCN-NEXT: ; %bb.4: ; %bb9 +; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() - %tmp = sub i32 %id, %arg + %my.tmp = sub i32 %id, %arg br label %bb1 bb1: @@ -61,58 +76,98 @@ bb1: bb4: %load = load volatile i32, i32 addrspace(1)* undef, align 4 - %cmp1 = icmp slt i32 %tmp, %load + %cmp1 = icmp slt i32 %my.tmp, %load br i1 %cmp1, label %bb1, label %bb9 bb9: ret void } -; OPT-LABEL: @undef_phi_cond_break_loop( -; OPT: bb1: -; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ] -; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] -; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1 -; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0 -; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow - -; OPT: bb4: -; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4 -; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load -; OPT-NEXT: br label %Flow - -; OPT: Flow: -; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] -; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ undef, %bb1 ] -; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break.i64(i1 %tmp3, i64 %phi.broken) -; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop.i64(i64 %0) -; OPT-NEXT: br i1 %1, label %bb9, label %bb1 - -; OPT: bb9: ; preds = %Flow -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %0) -; OPT-NEXT: store volatile i32 7 -; OPT-NEXT: ret void define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 { +; OPT-LABEL: @undef_phi_cond_break_loop( +; OPT-NEXT: bb: +; OPT-NEXT: [[ID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG:%.*]] +; OPT-NEXT: br label [[BB1:%.*]] +; 
OPT: bb1: +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] +; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 +; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 +; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] +; OPT: bb4: +; OPT-NEXT: [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4 +; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] +; OPT-NEXT: br label [[FLOW]] +; OPT: Flow: +; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] +; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ undef, [[BB1]] ] +; OPT-NEXT: [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP3]], i64 [[PHI_BROKEN]]) +; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) +; OPT-NEXT: br i1 [[TMP1]], label [[BB9:%.*]], label [[BB1]] +; OPT: bb9: +; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]]) +; OPT-NEXT: store volatile i32 7, i32 addrspace(3)* undef +; OPT-NEXT: ret void +; +; GCN-LABEL: undef_phi_cond_break_loop: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 +; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GCN-NEXT: ; implicit-def: $sgpr4 +; GCN-NEXT: BB1_1: ; %bb1 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_and_b64 s[8:9], s[0:1], exec +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GCN-NEXT: s_cmp_gt_i32 s4, -1 +; GCN-NEXT: s_cbranch_scc1 BB1_3 +; GCN-NEXT: ; %bb.2: ; %bb4 +; GCN-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], 
exec +; GCN-NEXT: s_and_b64 s[8:9], vcc, exec +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GCN-NEXT: BB1_3: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; GCN-NEXT: s_add_i32 s4, s4, 1 +; GCN-NEXT: s_and_b64 s[8:9], exec, s[6:7] +; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN-NEXT: s_cbranch_execnz BB1_1 +; GCN-NEXT: ; %bb.4: ; %bb9 +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, 7 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: ds_write_b32 v0, v0 +; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() - %tmp = sub i32 %id, %arg + %my.tmp = sub i32 %id, %arg br label %bb1 bb1: ; preds = %Flow, %bb - %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] + %lsr.iv = phi i32 [ undef, %bb ], [ %my.tmp2, %Flow ] %lsr.iv.next = add i32 %lsr.iv, 1 %cmp0 = icmp slt i32 %lsr.iv.next, 0 br i1 %cmp0, label %bb4, label %Flow bb4: ; preds = %bb1 %load = load volatile i32, i32 addrspace(1)* undef, align 4 - %cmp1 = icmp sge i32 %tmp, %load + %cmp1 = icmp sge i32 %my.tmp, %load br label %Flow Flow: ; preds = %bb4, %bb1 - %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] - %tmp3 = phi i1 [ %cmp1, %bb4 ], [ undef, %bb1 ] - br i1 %tmp3, label %bb9, label %bb1 + %my.tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] + %my.tmp3 = phi i1 [ %cmp1, %bb4 ], [ undef, %bb1 ] + br i1 %my.tmp3, label %bb9, label %bb1 bb9: ; preds = %Flow store volatile i32 7, i32 addrspace(3)* undef @@ -122,152 +177,271 @@ bb9: ; preds = %Flow ; FIXME: ConstantExpr compare of address to null folds away @lds = addrspace(3) global i32 undef -; OPT-LABEL: @constexpr_phi_cond_break_loop( -; OPT: bb1: -; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ] -; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] -; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1 -; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0 -; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow - -; OPT: bb4: -; OPT-NEXT: %load = load 
volatile i32, i32 addrspace(1)* undef, align 4 -; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load -; OPT-NEXT: br label %Flow - -; OPT: Flow: -; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] -; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), %bb1 ] -; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break.i64(i1 %tmp3, i64 %phi.broken) -; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop.i64(i64 %0) -; OPT-NEXT: br i1 %1, label %bb9, label %bb1 - -; OPT: bb9: ; preds = %Flow -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %0) -; OPT-NEXT: store volatile i32 7 -; OPT-NEXT: ret void define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { +; OPT-LABEL: @constexpr_phi_cond_break_loop( +; OPT-NEXT: bb: +; OPT-NEXT: [[ID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG:%.*]] +; OPT-NEXT: br label [[BB1:%.*]] +; OPT: bb1: +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] +; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 +; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 +; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] +; OPT: bb4: +; OPT-NEXT: [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4 +; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] +; OPT-NEXT: br label [[FLOW]] +; OPT: Flow: +; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] +; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), [[BB1]] ] +; OPT-NEXT: [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP3]], i64 [[PHI_BROKEN]]) +; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) +; OPT-NEXT: br i1 [[TMP1]], label 
[[BB9:%.*]], label [[BB1]] +; OPT: bb9: +; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]]) +; OPT-NEXT: store volatile i32 7, i32 addrspace(3)* undef +; OPT-NEXT: ret void +; +; GCN-LABEL: constexpr_phi_cond_break_loop: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 +; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: s_mov_b32 s2, lds@abs32@lo +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-NEXT: ; implicit-def: $sgpr3 +; GCN-NEXT: BB2_1: ; %bb1 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_cmp_ne_u32_e64 s[8:9], s2, 4 +; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GCN-NEXT: s_cmp_gt_i32 s3, -1 +; GCN-NEXT: s_cbranch_scc1 BB2_3 +; GCN-NEXT: ; %bb.2: ; %bb4 +; GCN-NEXT: ; in Loop: Header=BB2_1 Depth=1 +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_and_b64 s[8:9], vcc, exec +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GCN-NEXT: BB2_3: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB2_1 Depth=1 +; GCN-NEXT: s_add_i32 s3, s3, 1 +; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] +; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN-NEXT: s_cbranch_execnz BB2_1 +; GCN-NEXT: ; %bb.4: ; %bb9 +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, 7 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: ds_write_b32 v0, v0 +; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() - %tmp = sub i32 %id, %arg + %my.tmp = sub i32 %id, %arg br label %bb1 bb1: ; preds = %Flow, %bb - %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] + %lsr.iv = phi i32 [ undef, %bb ], [ %my.tmp2, %Flow ] %lsr.iv.next = add i32 %lsr.iv, 1 %cmp0 
= icmp slt i32 %lsr.iv.next, 0 br i1 %cmp0, label %bb4, label %Flow bb4: ; preds = %bb1 %load = load volatile i32, i32 addrspace(1)* undef, align 4 - %cmp1 = icmp sge i32 %tmp, %load + %cmp1 = icmp sge i32 %my.tmp, %load br label %Flow Flow: ; preds = %bb4, %bb1 - %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] - %tmp3 = phi i1 [ %cmp1, %bb4 ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), %bb1 ] - br i1 %tmp3, label %bb9, label %bb1 + %my.tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] + %my.tmp3 = phi i1 [ %cmp1, %bb4 ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), %bb1 ] + br i1 %my.tmp3, label %bb9, label %bb1 bb9: ; preds = %Flow store volatile i32 7, i32 addrspace(3)* undef ret void } -; OPT-LABEL: @true_phi_cond_break_loop( -; OPT: bb1: -; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ] -; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] -; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1 -; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0 -; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow - -; OPT: bb4: -; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4 -; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load -; OPT-NEXT: br label %Flow - -; OPT: Flow: -; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] -; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ] -; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break.i64(i1 %tmp3, i64 %phi.broken) -; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop.i64(i64 %0) -; OPT-NEXT: br i1 %1, label %bb9, label %bb1 - -; OPT: bb9: ; preds = %Flow -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %0) -; OPT-NEXT: store volatile i32 7 -; OPT-NEXT: ret void define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 { +; OPT-LABEL: @true_phi_cond_break_loop( +; OPT-NEXT: bb: +; OPT-NEXT: [[ID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 
[[ID]], [[ARG:%.*]] +; OPT-NEXT: br label [[BB1:%.*]] +; OPT: bb1: +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] +; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 +; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 +; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] +; OPT: bb4: +; OPT-NEXT: [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4 +; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] +; OPT-NEXT: br label [[FLOW]] +; OPT: Flow: +; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] +; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ true, [[BB1]] ] +; OPT-NEXT: [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP3]], i64 [[PHI_BROKEN]]) +; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) +; OPT-NEXT: br i1 [[TMP1]], label [[BB9:%.*]], label [[BB1]] +; OPT: bb9: +; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]]) +; OPT-NEXT: store volatile i32 7, i32 addrspace(3)* undef +; OPT-NEXT: ret void +; +; GCN-LABEL: true_phi_cond_break_loop: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 +; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-NEXT: ; implicit-def: $sgpr6 +; GCN-NEXT: BB3_1: ; %bb1 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cmp_gt_i32 s6, -1 +; GCN-NEXT: s_cbranch_scc1 BB3_3 +; GCN-NEXT: ; %bb.2: ; %bb4 +; GCN-NEXT: ; in Loop: Header=BB3_1 Depth=1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_and_b64 s[8:9], 
vcc, exec +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GCN-NEXT: BB3_3: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB3_1 Depth=1 +; GCN-NEXT: s_add_i32 s6, s6, 1 +; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] +; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN-NEXT: s_cbranch_execnz BB3_1 +; GCN-NEXT: ; %bb.4: ; %bb9 +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, 7 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: ds_write_b32 v0, v0 +; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() - %tmp = sub i32 %id, %arg + %my.tmp = sub i32 %id, %arg br label %bb1 bb1: ; preds = %Flow, %bb - %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] + %lsr.iv = phi i32 [ undef, %bb ], [ %my.tmp2, %Flow ] %lsr.iv.next = add i32 %lsr.iv, 1 %cmp0 = icmp slt i32 %lsr.iv.next, 0 br i1 %cmp0, label %bb4, label %Flow bb4: ; preds = %bb1 %load = load volatile i32, i32 addrspace(1)* undef, align 4 - %cmp1 = icmp sge i32 %tmp, %load + %cmp1 = icmp sge i32 %my.tmp, %load br label %Flow Flow: ; preds = %bb4, %bb1 - %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] - %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ] - br i1 %tmp3, label %bb9, label %bb1 + %my.tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] + %my.tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ] + br i1 %my.tmp3, label %bb9, label %bb1 bb9: ; preds = %Flow store volatile i32 7, i32 addrspace(3)* undef ret void } -; OPT-LABEL: @false_phi_cond_break_loop( -; OPT: bb1: -; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ] -; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] -; OPT-NOT: call -; OPT: br i1 %cmp0, label %bb4, label %Flow - -; OPT: bb4: -; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4 -; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load -; OPT-NEXT: br label %Flow - -; OPT: Flow: -; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] -; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ false, %bb1 
] -; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break.i64(i1 %tmp3, i64 %phi.broken) -; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop.i64(i64 %0) -; OPT-NEXT: br i1 %1, label %bb9, label %bb1 - -; OPT: bb9: ; preds = %Flow -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %0) -; OPT-NEXT: store volatile i32 7 -; OPT-NEXT: ret void define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 { +; OPT-LABEL: @false_phi_cond_break_loop( +; OPT-NEXT: bb: +; OPT-NEXT: [[ID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG:%.*]] +; OPT-NEXT: br label [[BB1:%.*]] +; OPT: bb1: +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] +; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 +; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 +; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] +; OPT: bb4: +; OPT-NEXT: [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4 +; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] +; OPT-NEXT: br label [[FLOW]] +; OPT: Flow: +; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] +; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ false, [[BB1]] ] +; OPT-NEXT: [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP3]], i64 [[PHI_BROKEN]]) +; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) +; OPT-NEXT: br i1 [[TMP1]], label [[BB9:%.*]], label [[BB1]] +; OPT: bb9: +; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]]) +; OPT-NEXT: store volatile i32 7, i32 addrspace(3)* undef +; OPT-NEXT: ret void +; +; GCN-LABEL: false_phi_cond_break_loop: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 +; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 +; GCN-NEXT: 
s_mov_b32 s3, 0xf000 +; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-NEXT: ; implicit-def: $sgpr6 +; GCN-NEXT: BB4_1: ; %bb1 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cmp_gt_i32 s6, -1 +; GCN-NEXT: s_cbranch_scc1 BB4_3 +; GCN-NEXT: ; %bb.2: ; %bb4 +; GCN-NEXT: ; in Loop: Header=BB4_1 Depth=1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_and_b64 s[8:9], vcc, exec +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GCN-NEXT: BB4_3: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB4_1 Depth=1 +; GCN-NEXT: s_add_i32 s6, s6, 1 +; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] +; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN-NEXT: s_cbranch_execnz BB4_1 +; GCN-NEXT: ; %bb.4: ; %bb9 +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, 7 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: ds_write_b32 v0, v0 +; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() - %tmp = sub i32 %id, %arg + %my.tmp = sub i32 %id, %arg br label %bb1 bb1: ; preds = %Flow, %bb - %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] + %lsr.iv = phi i32 [ undef, %bb ], [ %my.tmp2, %Flow ] %lsr.iv.next = add i32 %lsr.iv, 1 %cmp0 = icmp slt i32 %lsr.iv.next, 0 br i1 %cmp0, label %bb4, label %Flow bb4: ; preds = %bb1 %load = load volatile i32, i32 addrspace(1)* undef, align 4 - %cmp1 = icmp sge i32 %tmp, %load + %cmp1 = icmp sge i32 %my.tmp, %load br label %Flow Flow: ; preds = %bb4, %bb1 - %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] - %tmp3 = phi i1 [ %cmp1, %bb4 ], [ false, %bb1 ] - br i1 %tmp3, label %bb9, label %bb1 + %my.tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] + %my.tmp3 = phi i1 [ %cmp1, %bb4 ], [ false, %bb1 ] + br i1 %my.tmp3, label %bb9, label %bb1 bb9: ; preds = %Flow store volatile i32 7, i32 addrspace(3)* 
undef @@ -277,52 +451,91 @@ bb9: ; preds = %Flow ; Swap order of branches in flow block so that the true phi is ; continue. -; OPT-LABEL: @invert_true_phi_cond_break_loop( -; OPT: bb1: -; OPT-NEXT: %phi.broken = phi i64 [ %1, %Flow ], [ 0, %bb ] -; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] -; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1 -; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0 -; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow - -; OPT: bb4: -; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4 -; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load -; OPT-NEXT: br label %Flow - -; OPT: Flow: -; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] -; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ] -; OPT-NEXT: %0 = xor i1 %tmp3, true -; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break.i64(i1 %0, i64 %phi.broken) -; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop.i64(i64 %1) -; OPT-NEXT: br i1 %2, label %bb9, label %bb1 - -; OPT: bb9: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %1) -; OPT-NEXT: store volatile i32 7, i32 addrspace(3)* undef -; OPT-NEXT: ret void define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 { +; OPT-LABEL: @invert_true_phi_cond_break_loop( +; OPT-NEXT: bb: +; OPT-NEXT: [[ID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG:%.*]] +; OPT-NEXT: br label [[BB1:%.*]] +; OPT: bb1: +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP1:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] +; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 +; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 +; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] +; OPT: bb4: +; OPT-NEXT: [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4 +; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] +; OPT-NEXT: br label [[FLOW]] +; OPT: Flow: 
+; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] +; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ true, [[BB1]] ] +; OPT-NEXT: [[TMP0:%.*]] = xor i1 [[MY_TMP3]], true +; OPT-NEXT: [[TMP1]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP0]], i64 [[PHI_BROKEN]]) +; OPT-NEXT: [[TMP2:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP1]]) +; OPT-NEXT: br i1 [[TMP2]], label [[BB9:%.*]], label [[BB1]] +; OPT: bb9: +; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP1]]) +; OPT-NEXT: store volatile i32 7, i32 addrspace(3)* undef +; OPT-NEXT: ret void +; +; GCN-LABEL: invert_true_phi_cond_break_loop: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 +; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-NEXT: ; implicit-def: $sgpr6 +; GCN-NEXT: BB5_1: ; %bb1 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cmp_gt_i32 s6, -1 +; GCN-NEXT: s_cbranch_scc1 BB5_3 +; GCN-NEXT: ; %bb.2: ; %bb4 +; GCN-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_and_b64 s[8:9], vcc, exec +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GCN-NEXT: BB5_3: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-NEXT: s_add_i32 s6, s6, 1 +; GCN-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-NEXT: s_and_b64 s[8:9], exec, s[8:9] +; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN-NEXT: s_cbranch_execnz BB5_1 +; GCN-NEXT: ; %bb.4: ; %bb9 +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, 7 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: ds_write_b32 v0, v0 +; GCN-NEXT: s_endpgm bb: %id = 
call i32 @llvm.amdgcn.workitem.id.x() - %tmp = sub i32 %id, %arg + %my.tmp = sub i32 %id, %arg br label %bb1 bb1: ; preds = %Flow, %bb - %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] + %lsr.iv = phi i32 [ undef, %bb ], [ %my.tmp2, %Flow ] %lsr.iv.next = add i32 %lsr.iv, 1 %cmp0 = icmp slt i32 %lsr.iv.next, 0 br i1 %cmp0, label %bb4, label %Flow bb4: ; preds = %bb1 %load = load volatile i32, i32 addrspace(1)* undef, align 4 - %cmp1 = icmp sge i32 %tmp, %load + %cmp1 = icmp sge i32 %my.tmp, %load br label %Flow Flow: ; preds = %bb4, %bb1 - %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] - %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ] - br i1 %tmp3, label %bb1, label %bb9 + %my.tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] + %my.tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ] + br i1 %my.tmp3, label %bb1, label %bb9 bb9: ; preds = %Flow store volatile i32 7, i32 addrspace(3)* undef diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll index 6e846fd56498ca..e6712277a90b44 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -1,3 +1,5 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s @@ -6,74 +8,89 @@ ; the condition that appears to have no uses until the loop is ; completely processed. 
- -; IR-LABEL: @reduced_nested_loop_conditions( - -; IR: bb5: -; IR-NEXT: %phi.broken = phi i64 [ %3, %bb10 ], [ 0, %bb ] -; IR-NEXT: %tmp6 = phi i32 [ 0, %bb ], [ %tmp11, %bb10 ] -; IR-NEXT: %tmp7 = icmp eq i32 %tmp6, 1 -; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %tmp7) -; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0 -; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1 -; IR-NEXT: br i1 %1, label %bb8, label %Flow - -; IR: bb8: -; IR-NEXT: br label %bb13 - -; IR: bb10: -; IR-NEXT: %tmp11 = phi i32 [ %6, %Flow ] -; IR-NEXT: %tmp12 = phi i1 [ %5, %Flow ] -; IR-NEXT: %3 = call i64 @llvm.amdgcn.if.break.i64(i1 %tmp12, i64 %phi.broken) -; IR-NEXT: %4 = call i1 @llvm.amdgcn.loop.i64(i64 %3) -; IR-NEXT: br i1 %4, label %bb23, label %bb5 - -; IR: Flow: -; IR-NEXT: %5 = phi i1 [ %tmp22, %bb4 ], [ true, %bb5 ] -; IR-NEXT: %6 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ] -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %2) -; IR-NEXT: br label %bb10 - -; IR: bb13: -; IR-NEXT: %tmp14 = phi i1 [ %tmp22, %bb3 ], [ true, %bb8 ] -; IR-NEXT: %tmp15 = bitcast i64 %tmp2 to <2 x i32> -; IR-NEXT: br i1 %tmp14, label %bb16, label %bb20 - -; IR: bb16: -; IR-NEXT: %tmp17 = extractelement <2 x i32> %tmp15, i64 1 -; IR-NEXT: %tmp18 = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 %tmp17 -; IR-NEXT: %tmp19 = load volatile i32, i32 addrspace(3)* %tmp18 -; IR-NEXT: br label %bb20 - -; IR: bb20: -; IR-NEXT: %tmp21 = phi i32 [ %tmp19, %bb16 ], [ 0, %bb13 ] -; IR-NEXT: %tmp22 = phi i1 [ false, %bb16 ], [ %tmp14, %bb13 ] -; IR-NEXT: br label %bb9 - -; IR: bb23: -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %3) -; IR-NEXT: ret void - -; GCN-LABEL: {{^}}reduced_nested_loop_conditions: - -; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 -; GCN-NEXT: s_cbranch_scc0 - -; FIXME: Should fold to unconditional branch? 
-; GCN: ; implicit-def -; GCN: s_cbranch_vccnz - -; GCN: ds_read_b32 - -; GCN: [[BB9:BB[0-9]+_[0-9]+]]: ; %bb9 -; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_mov_b64 vcc, vcc -; GCN-NEXT: s_cbranch_vccnz [[BB9]] define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* nocapture %arg) #0 { +; GCN-LABEL: reduced_nested_loop_conditions: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: ds_read_b64 v[0:1], v0 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_and_b64 vcc, exec, 0 +; GCN-NEXT: BB0_1: ; %bb5 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_cmp_lg_u32 s0, 1 +; GCN-NEXT: s_cbranch_scc0 BB0_3 +; GCN-NEXT: ; %bb.2: ; %bb10 +; GCN-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; GCN-NEXT: ; implicit-def: $sgpr0 +; GCN-NEXT: s_cbranch_vccnz BB0_1 +; GCN-NEXT: s_branch BB0_5 +; GCN-NEXT: BB0_3: ; %bb8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_and_b64 vcc, exec, -1 +; GCN-NEXT: BB0_4: ; %bb9 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_cbranch_vccnz BB0_4 +; GCN-NEXT: BB0_5: ; %DummyReturnBlock +; GCN-NEXT: s_endpgm +; IR-LABEL: @reduced_nested_loop_conditions( +; IR-NEXT: bb: +; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #4 +; IR-NEXT: [[MY_TMP1:%.*]] = getelementptr inbounds i64, i64 addrspace(3)* [[ARG:%.*]], i32 [[MY_TMP]] +; IR-NEXT: [[MY_TMP2:%.*]] = load volatile i64, i64 addrspace(3)* [[MY_TMP1]] +; IR-NEXT: br label [[BB5:%.*]] +; IR: bb3: +; IR-NEXT: br i1 true, label [[BB4:%.*]], label [[BB13:%.*]] +; IR: bb4: +; IR-NEXT: br label [[FLOW:%.*]] +; IR: bb5: +; IR-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP3:%.*]], [[BB10:%.*]] ], [ 0, [[BB:%.*]] ] +; IR-NEXT: [[MY_TMP6:%.*]] = phi i32 [ 0, [[BB]] ], [ [[MY_TMP11:%.*]], [[BB10]] ] +; IR-NEXT: [[MY_TMP7:%.*]] = icmp 
eq i32 [[MY_TMP6]], 1 +; IR-NEXT: [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[MY_TMP7]]) +; IR-NEXT: [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0 +; IR-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1 +; IR-NEXT: br i1 [[TMP1]], label [[BB8:%.*]], label [[FLOW]] +; IR: bb8: +; IR-NEXT: br label [[BB13]] +; IR: bb9: +; IR-NEXT: br i1 false, label [[BB3:%.*]], label [[BB9:%.*]] +; IR: bb10: +; IR-NEXT: [[MY_TMP11]] = phi i32 [ [[TMP6:%.*]], [[FLOW]] ] +; IR-NEXT: [[MY_TMP12:%.*]] = phi i1 [ [[TMP5:%.*]], [[FLOW]] ] +; IR-NEXT: [[TMP3]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP12]], i64 [[PHI_BROKEN]]) +; IR-NEXT: [[TMP4:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP3]]) +; IR-NEXT: br i1 [[TMP4]], label [[BB23:%.*]], label [[BB5]] +; IR: Flow: +; IR-NEXT: [[TMP5]] = phi i1 [ [[MY_TMP22:%.*]], [[BB4]] ], [ true, [[BB5]] ] +; IR-NEXT: [[TMP6]] = phi i32 [ [[MY_TMP21:%.*]], [[BB4]] ], [ undef, [[BB5]] ] +; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) +; IR-NEXT: br label [[BB10]] +; IR: bb13: +; IR-NEXT: [[MY_TMP14:%.*]] = phi i1 [ [[MY_TMP22]], [[BB3]] ], [ true, [[BB8]] ] +; IR-NEXT: [[MY_TMP15:%.*]] = bitcast i64 [[MY_TMP2]] to <2 x i32> +; IR-NEXT: br i1 [[MY_TMP14]], label [[BB16:%.*]], label [[BB20:%.*]] +; IR: bb16: +; IR-NEXT: [[MY_TMP17:%.*]] = extractelement <2 x i32> [[MY_TMP15]], i64 1 +; IR-NEXT: [[MY_TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 [[MY_TMP17]] +; IR-NEXT: [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[MY_TMP18]] +; IR-NEXT: br label [[BB20]] +; IR: bb20: +; IR-NEXT: [[MY_TMP21]] = phi i32 [ [[MY_TMP19]], [[BB16]] ], [ 0, [[BB13]] ] +; IR-NEXT: [[MY_TMP22]] = phi i1 [ false, [[BB16]] ], [ [[MY_TMP14]], [[BB13]] ] +; IR-NEXT: br label [[BB9]] +; IR: bb23: +; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP3]]) +; IR-NEXT: ret void +; bb: - %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 - %tmp1 = getelementptr inbounds i64, i64 addrspace(3)* %arg, i32 
%tmp - %tmp2 = load volatile i64, i64 addrspace(3)* %tmp1 + %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %my.tmp1 = getelementptr inbounds i64, i64 addrspace(3)* %arg, i32 %my.tmp + %my.tmp2 = load volatile i64, i64 addrspace(3)* %my.tmp1 br label %bb5 bb3: ; preds = %bb9 @@ -83,9 +100,9 @@ bb4: ; preds = %bb3 br label %bb10 bb5: ; preds = %bb10, %bb - %tmp6 = phi i32 [ 0, %bb ], [ %tmp11, %bb10 ] - %tmp7 = icmp eq i32 %tmp6, 1 - br i1 %tmp7, label %bb8, label %bb10 + %my.tmp6 = phi i32 [ 0, %bb ], [ %my.tmp11, %bb10 ] + %my.tmp7 = icmp eq i32 %my.tmp6, 1 + br i1 %my.tmp7, label %bb8, label %bb10 bb8: ; preds = %bb5 br label %bb13 @@ -94,24 +111,24 @@ bb9: ; preds = %bb20, %bb9 br i1 false, label %bb3, label %bb9 bb10: ; preds = %bb5, %bb4 - %tmp11 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ] - %tmp12 = phi i1 [ %tmp22, %bb4 ], [ true, %bb5 ] - br i1 %tmp12, label %bb23, label %bb5 + %my.tmp11 = phi i32 [ %my.tmp21, %bb4 ], [ undef, %bb5 ] + %my.tmp12 = phi i1 [ %my.tmp22, %bb4 ], [ true, %bb5 ] + br i1 %my.tmp12, label %bb23, label %bb5 bb13: ; preds = %bb8, %bb3 - %tmp14 = phi i1 [ %tmp22, %bb3 ], [ true, %bb8 ] - %tmp15 = bitcast i64 %tmp2 to <2 x i32> - br i1 %tmp14, label %bb16, label %bb20 + %my.tmp14 = phi i1 [ %my.tmp22, %bb3 ], [ true, %bb8 ] + %my.tmp15 = bitcast i64 %my.tmp2 to <2 x i32> + br i1 %my.tmp14, label %bb16, label %bb20 bb16: ; preds = %bb13 - %tmp17 = extractelement <2 x i32> %tmp15, i64 1 - %tmp18 = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 %tmp17 - %tmp19 = load volatile i32, i32 addrspace(3)* %tmp18 + %my.tmp17 = extractelement <2 x i32> %my.tmp15, i64 1 + %my.tmp18 = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 %my.tmp17 + %my.tmp19 = load volatile i32, i32 addrspace(3)* %my.tmp18 br label %bb20 bb20: ; preds = %bb16, %bb13 - %tmp21 = phi i32 [ %tmp19, %bb16 ], [ 0, %bb13 ] - %tmp22 = phi i1 [ false, %bb16 ], [ %tmp14, %bb13 ] + %my.tmp21 = phi i32 [ %my.tmp19, %bb16 ], [ 0, %bb13 ] + %my.tmp22 = phi 
i1 [ false, %bb16 ], [ %my.tmp14, %bb13 ] br label %bb9 bb23: ; preds = %bb10 @@ -119,97 +136,146 @@ bb23: ; preds = %bb10 } ; Earlier version of above, before a run of the structurizer. -; IR-LABEL: @nested_loop_conditions( - -; IR: Flow3: -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %21) -; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %14) -; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0 -; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1 -; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow4 - -; IR: Flow4: -; IR-NEXT: %3 = phi i1 [ true, %bb4.bb13_crit_edge ], [ false, %Flow3 ] -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %2) -; IR-NEXT: br label %Flow - -; IR: Flow: -; IR-NEXT: %4 = phi i1 [ %3, %Flow4 ], [ true, %bb ] -; IR-NEXT: %5 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %4) -; IR-NEXT: %6 = extractvalue { i1, i64 } %5, 0 -; IR-NEXT: %7 = extractvalue { i1, i64 } %5, 1 -; IR-NEXT: br i1 %6, label %bb13, label %bb31 - -; IR: bb14: -; IR: %tmp15 = icmp eq i32 %tmp1037, 1 -; IR-NEXT: %8 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %tmp15) - -; IR: Flow1: -; IR-NEXT: %11 = phi <4 x i32> [ %tmp9, %bb21 ], [ undef, %bb14 ] -; IR-NEXT: %12 = phi i32 [ %tmp10, %bb21 ], [ undef, %bb14 ] -; IR-NEXT: %13 = phi i1 [ %18, %bb21 ], [ true, %bb14 ] -; IR-NEXT: %14 = phi i1 [ %18, %bb21 ], [ false, %bb14 ] -; IR-NEXT: %15 = phi i1 [ false, %bb21 ], [ true, %bb14 ] -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %10) -; IR-NEXT: %16 = call i64 @llvm.amdgcn.if.break.i64(i1 %13, i64 %phi.broken) -; IR-NEXT: %17 = call i1 @llvm.amdgcn.loop.i64(i64 %16) -; IR-NEXT: br i1 %17, label %Flow2, label %bb14 - -; IR: bb21: -; IR: %tmp12 = icmp slt i32 %tmp11, 9 -; IR-NEXT: %18 = xor i1 %tmp12, true -; IR-NEXT: br label %Flow1 - -; IR: Flow2: -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %16) -; IR-NEXT: %19 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %15) -; IR-NEXT: %20 = extractvalue { i1, i64 } %19, 0 -; IR-NEXT: %21 = extractvalue { i1, i64 } %19, 1 -; 
IR-NEXT: br i1 %20, label %bb31.loopexit, label %Flow3 -; IR: bb31: -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %7) -; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef -; IR-NEXT: ret void - - -; GCN-LABEL: {{^}}nested_loop_conditions: - -; GCN: v_cmp_lt_i32_e32 vcc, 8, v -; GCN: s_and_b64 vcc, exec, vcc -; GCN: s_cbranch_vccnz [[BB31:BB[0-9]+_[0-9]+]] - -; GCN: [[BB14:BB[0-9]+_[0-9]+]]: ; %bb14 -; GCN: v_cmp_ne_u32_e32 vcc, 1, v -; GCN-NEXT: s_and_b64 vcc, exec, vcc -; GCN-NEXT: s_cbranch_vccnz [[BB31]] - -; GCN: [[BB18:BB[0-9]+_[0-9]+]]: ; %bb18 -; GCN: buffer_load_dword -; GCN: v_cmp_lt_i32_e32 vcc, 8, v -; GCN-NEXT: s_and_b64 vcc, exec, vcc -; GCN-NEXT: s_cbranch_vccnz [[BB18]] - -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: v_cmp_gt_i32_e32 vcc, 9 -; GCN-NEXT: s_and_b64 vcc, exec, vcc -; GCN-NEXT: s_cbranch_vccnz [[BB14]] - -; GCN: [[BB31]]: -; GCN: buffer_store_dword -; GCN: s_endpgm define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %arg) #0 { +; GCN-LABEL: nested_loop_conditions: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 8, v0 +; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_cbranch_vccnz BB1_5 +; GCN-NEXT: ; %bb.1: ; %bb14.lr.ph +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: BB1_2: ; %bb14 +; GCN-NEXT: ; =>This Loop Header: Depth=1 +; GCN-NEXT: ; Child Loop BB1_3 Depth 2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_cbranch_vccnz BB1_5 +; GCN-NEXT: BB1_3: ; %bb18 +; GCN-NEXT: ; Parent Loop BB1_2 Depth=1 +; GCN-NEXT: ; => This Inner Loop Header: Depth=2 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 8, v0 +; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_cbranch_vccnz BB1_3 +; 
GCN-NEXT: ; %bb.4: ; %bb21 +; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 9, v1 +; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_cbranch_vccnz BB1_2 +; GCN-NEXT: BB1_5: ; %bb31 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; IR-LABEL: @nested_loop_conditions( +; IR-NEXT: bb: +; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #4 +; IR-NEXT: [[MY_TMP1:%.*]] = zext i32 [[MY_TMP]] to i64 +; IR-NEXT: [[MY_TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[ARG:%.*]], i64 [[MY_TMP1]] +; IR-NEXT: [[MY_TMP3:%.*]] = load i64, i64 addrspace(1)* [[MY_TMP2]], align 16 +; IR-NEXT: [[MY_TMP932:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16 +; IR-NEXT: [[MY_TMP1033:%.*]] = extractelement <4 x i32> [[MY_TMP932]], i64 0 +; IR-NEXT: [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef +; IR-NEXT: [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9 +; IR-NEXT: br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]] +; IR: bb14.lr.ph: +; IR-NEXT: br label [[BB14:%.*]] +; IR: Flow3: +; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP21:%.*]]) +; IR-NEXT: [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP14:%.*]]) +; IR-NEXT: [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0 +; IR-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1 +; IR-NEXT: br i1 [[TMP1]], label [[BB4_BB13_CRIT_EDGE:%.*]], label [[FLOW4:%.*]] +; IR: bb4.bb13_crit_edge: +; IR-NEXT: br label [[FLOW4]] +; IR: Flow4: +; IR-NEXT: [[TMP3:%.*]] = phi i1 [ true, [[BB4_BB13_CRIT_EDGE]] ], [ false, [[FLOW3:%.*]] ] +; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) +; IR-NEXT: br label [[FLOW]] +; IR: bb13: +; IR-NEXT: br label [[BB31:%.*]] +; IR: Flow: +; IR-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP3]], 
[[FLOW4]] ], [ true, [[BB:%.*]] ] +; IR-NEXT: [[TMP5:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = extractvalue { i1, i64 } [[TMP5]], 0 +; IR-NEXT: [[TMP7:%.*]] = extractvalue { i1, i64 } [[TMP5]], 1 +; IR-NEXT: br i1 [[TMP6]], label [[BB13:%.*]], label [[BB31]] +; IR: bb14: +; IR-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP16:%.*]], [[FLOW1:%.*]] ], [ 0, [[BB14_LR_PH]] ] +; IR-NEXT: [[MY_TMP1037:%.*]] = phi i32 [ [[MY_TMP1033]], [[BB14_LR_PH]] ], [ [[TMP12:%.*]], [[FLOW1]] ] +; IR-NEXT: [[MY_TMP936:%.*]] = phi <4 x i32> [ [[MY_TMP932]], [[BB14_LR_PH]] ], [ [[TMP11:%.*]], [[FLOW1]] ] +; IR-NEXT: [[MY_TMP15:%.*]] = icmp eq i32 [[MY_TMP1037]], 1 +; IR-NEXT: [[TMP8:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[MY_TMP15]]) +; IR-NEXT: [[TMP9:%.*]] = extractvalue { i1, i64 } [[TMP8]], 0 +; IR-NEXT: [[TMP10:%.*]] = extractvalue { i1, i64 } [[TMP8]], 1 +; IR-NEXT: br i1 [[TMP9]], label [[BB16:%.*]], label [[FLOW1]] +; IR: bb16: +; IR-NEXT: [[MY_TMP17:%.*]] = bitcast i64 [[MY_TMP3]] to <2 x i32> +; IR-NEXT: br label [[BB18:%.*]] +; IR: Flow1: +; IR-NEXT: [[TMP11]] = phi <4 x i32> [ [[MY_TMP9:%.*]], [[BB21:%.*]] ], [ undef, [[BB14]] ] +; IR-NEXT: [[TMP12]] = phi i32 [ [[MY_TMP10:%.*]], [[BB21]] ], [ undef, [[BB14]] ] +; IR-NEXT: [[TMP13:%.*]] = phi i1 [ [[TMP18:%.*]], [[BB21]] ], [ true, [[BB14]] ] +; IR-NEXT: [[TMP14]] = phi i1 [ [[TMP18]], [[BB21]] ], [ false, [[BB14]] ] +; IR-NEXT: [[TMP15:%.*]] = phi i1 [ false, [[BB21]] ], [ true, [[BB14]] ] +; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP10]]) +; IR-NEXT: [[TMP16]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP13]], i64 [[PHI_BROKEN]]) +; IR-NEXT: [[TMP17:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP16]]) +; IR-NEXT: br i1 [[TMP17]], label [[FLOW2:%.*]], label [[BB14]] +; IR: bb18: +; IR-NEXT: [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(1)* undef +; IR-NEXT: [[MY_TMP20:%.*]] = icmp slt i32 [[MY_TMP19]], 9 +; IR-NEXT: br i1 [[MY_TMP20]], label [[BB21]], label 
[[BB18]] +; IR: bb21: +; IR-NEXT: [[MY_TMP22:%.*]] = extractelement <2 x i32> [[MY_TMP17]], i64 1 +; IR-NEXT: [[MY_TMP23:%.*]] = lshr i32 [[MY_TMP22]], 16 +; IR-NEXT: [[MY_TMP24:%.*]] = select i1 undef, i32 undef, i32 [[MY_TMP23]] +; IR-NEXT: [[MY_TMP25:%.*]] = uitofp i32 [[MY_TMP24]] to float +; IR-NEXT: [[MY_TMP26:%.*]] = fmul float [[MY_TMP25]], 0x3EF0001000000000 +; IR-NEXT: [[MY_TMP27:%.*]] = fsub float [[MY_TMP26]], undef +; IR-NEXT: [[MY_TMP28:%.*]] = fcmp olt float [[MY_TMP27]], 5.000000e-01 +; IR-NEXT: [[MY_TMP29:%.*]] = select i1 [[MY_TMP28]], i64 1, i64 2 +; IR-NEXT: [[MY_TMP30:%.*]] = extractelement <4 x i32> [[MY_TMP936]], i64 [[MY_TMP29]] +; IR-NEXT: [[MY_TMP7:%.*]] = zext i32 [[MY_TMP30]] to i64 +; IR-NEXT: [[MY_TMP8:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 [[MY_TMP7]] +; IR-NEXT: [[MY_TMP9]] = load <4 x i32>, <4 x i32> addrspace(1)* [[MY_TMP8]], align 16 +; IR-NEXT: [[MY_TMP10]] = extractelement <4 x i32> [[MY_TMP9]], i64 0 +; IR-NEXT: [[MY_TMP11:%.*]] = load volatile i32, i32 addrspace(1)* undef +; IR-NEXT: [[MY_TMP12:%.*]] = icmp slt i32 [[MY_TMP11]], 9 +; IR-NEXT: [[TMP18]] = xor i1 [[MY_TMP12]], true +; IR-NEXT: br label [[FLOW1]] +; IR: Flow2: +; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP16]]) +; IR-NEXT: [[TMP19:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]]) +; IR-NEXT: [[TMP20:%.*]] = extractvalue { i1, i64 } [[TMP19]], 0 +; IR-NEXT: [[TMP21]] = extractvalue { i1, i64 } [[TMP19]], 1 +; IR-NEXT: br i1 [[TMP20]], label [[BB31_LOOPEXIT:%.*]], label [[FLOW3]] +; IR: bb31.loopexit: +; IR-NEXT: br label [[FLOW3]] +; IR: bb31: +; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]]) +; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef +; IR-NEXT: ret void +; bb: - %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 - %tmp1 = zext i32 %tmp to i64 - %tmp2 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp1 - %tmp3 = load i64, i64 addrspace(1)* %tmp2, align 16 - %tmp932 = 
load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16 - %tmp1033 = extractelement <4 x i32> %tmp932, i64 0 - %tmp1134 = load volatile i32, i32 addrspace(1)* undef - %tmp1235 = icmp slt i32 %tmp1134, 9 - br i1 %tmp1235, label %bb14.lr.ph, label %bb13 + %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %my.tmp1 = zext i32 %my.tmp to i64 + %my.tmp2 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %my.tmp1 + %my.tmp3 = load i64, i64 addrspace(1)* %my.tmp2, align 16 + %my.tmp932 = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16 + %my.tmp1033 = extractelement <4 x i32> %my.tmp932, i64 0 + %my.tmp1134 = load volatile i32, i32 addrspace(1)* undef + %my.tmp1235 = icmp slt i32 %my.tmp1134, 9 + br i1 %my.tmp1235, label %bb14.lr.ph, label %bb13 bb14.lr.ph: ; preds = %bb br label %bb14 @@ -221,37 +287,37 @@ bb13: ; preds = %bb4.bb13_crit_edge, br label %bb31 bb14: ; preds = %bb21, %bb14.lr.ph - %tmp1037 = phi i32 [ %tmp1033, %bb14.lr.ph ], [ %tmp10, %bb21 ] - %tmp936 = phi <4 x i32> [ %tmp932, %bb14.lr.ph ], [ %tmp9, %bb21 ] - %tmp15 = icmp eq i32 %tmp1037, 1 - br i1 %tmp15, label %bb16, label %bb31.loopexit + %my.tmp1037 = phi i32 [ %my.tmp1033, %bb14.lr.ph ], [ %my.tmp10, %bb21 ] + %my.tmp936 = phi <4 x i32> [ %my.tmp932, %bb14.lr.ph ], [ %my.tmp9, %bb21 ] + %my.tmp15 = icmp eq i32 %my.tmp1037, 1 + br i1 %my.tmp15, label %bb16, label %bb31.loopexit bb16: ; preds = %bb14 - %tmp17 = bitcast i64 %tmp3 to <2 x i32> + %my.tmp17 = bitcast i64 %my.tmp3 to <2 x i32> br label %bb18 bb18: ; preds = %bb18, %bb16 - %tmp19 = load volatile i32, i32 addrspace(1)* undef - %tmp20 = icmp slt i32 %tmp19, 9 - br i1 %tmp20, label %bb21, label %bb18 + %my.tmp19 = load volatile i32, i32 addrspace(1)* undef + %my.tmp20 = icmp slt i32 %my.tmp19, 9 + br i1 %my.tmp20, label %bb21, label %bb18 bb21: ; preds = %bb18 - %tmp22 = extractelement <2 x i32> %tmp17, i64 1 - %tmp23 = lshr i32 %tmp22, 16 - %tmp24 = select i1 undef, i32 undef, i32 %tmp23 - %tmp25 = uitofp i32 %tmp24 to 
float - %tmp26 = fmul float %tmp25, 0x3EF0001000000000 - %tmp27 = fsub float %tmp26, undef - %tmp28 = fcmp olt float %tmp27, 5.000000e-01 - %tmp29 = select i1 %tmp28, i64 1, i64 2 - %tmp30 = extractelement <4 x i32> %tmp936, i64 %tmp29 - %tmp7 = zext i32 %tmp30 to i64 - %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %tmp7 - %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16 - %tmp10 = extractelement <4 x i32> %tmp9, i64 0 - %tmp11 = load volatile i32, i32 addrspace(1)* undef - %tmp12 = icmp slt i32 %tmp11, 9 - br i1 %tmp12, label %bb14, label %bb4.bb13_crit_edge + %my.tmp22 = extractelement <2 x i32> %my.tmp17, i64 1 + %my.tmp23 = lshr i32 %my.tmp22, 16 + %my.tmp24 = select i1 undef, i32 undef, i32 %my.tmp23 + %my.tmp25 = uitofp i32 %my.tmp24 to float + %my.tmp26 = fmul float %my.tmp25, 0x3EF0001000000000 + %my.tmp27 = fsub float %my.tmp26, undef + %my.tmp28 = fcmp olt float %my.tmp27, 5.000000e-01 + %my.tmp29 = select i1 %my.tmp28, i64 1, i64 2 + %my.tmp30 = extractelement <4 x i32> %my.tmp936, i64 %my.tmp29 + %my.tmp7 = zext i32 %my.tmp30 to i64 + %my.tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %my.tmp7 + %my.tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %my.tmp8, align 16 + %my.tmp10 = extractelement <4 x i32> %my.tmp9, i64 0 + %my.tmp11 = load volatile i32, i32 addrspace(1)* undef + %my.tmp12 = icmp slt i32 %my.tmp11, 9 + br i1 %my.tmp12, label %bb14, label %bb4.bb13_crit_edge bb31.loopexit: ; preds = %bb14 br label %bb31 diff --git a/llvm/test/CodeGen/AMDGPU/postra-machine-sink.mir b/llvm/test/CodeGen/AMDGPU/postra-machine-sink.mir index b034cae9926082..c77d6e0eb1be12 100644 --- a/llvm/test/CodeGen/AMDGPU/postra-machine-sink.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-machine-sink.mir @@ -5,7 +5,7 @@ # CHECK-LABEL: bb.0: # CHECK: renamable $sgpr1 = COPY renamable $sgpr2 # CHECK-LABEL: bb.1: -# CHECK: liveins: $sgpr0_sgpr1:0x00000003 +# CHECK: liveins: $sgpr0_sgpr1:0x0000000000000003 # 
CHECK: renamable $vgpr1_vgpr2 = COPY renamable $sgpr0_sgpr1 --- diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll index 23bb18e738f54b..faf6ca4cbcb288 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -1,16 +1,55 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}break_inserted_outside_of_loop: - -; SI: [[LOOP_LABEL:[A-Z0-9]+]]: -; Lowered break instructin: -; SI: s_or_b64 -; Lowered Loop instruction: -; SI: s_andn2_b64 -; s_cbranch_execnz [[LOOP_LABEL]] -; SI: s_endpgm +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=FLAT %s + define amdgpu_kernel void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) { +; SI-LABEL: break_inserted_outside_of_loop: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, s0, v0 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: BB0_1: ; %ENDIF +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_and_b64 s[2:3], exec, vcc +; SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SI-NEXT: s_cbranch_execnz BB0_1 +; SI-NEXT: ; %bb.2: ; %ENDLOOP +; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 
0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; FLAT-LABEL: break_inserted_outside_of_loop: +; FLAT: ; %bb.0: ; %main_body +; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; FLAT-NEXT: s_load_dword s0, s[0:1], 0x2c +; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; FLAT-NEXT: s_waitcnt lgkmcnt(0) +; FLAT-NEXT: v_and_b32_e32 v0, s0, v0 +; FLAT-NEXT: v_and_b32_e32 v0, 1, v0 +; FLAT-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; FLAT-NEXT: s_mov_b64 s[0:1], 0 +; FLAT-NEXT: BB0_1: ; %ENDIF +; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 +; FLAT-NEXT: s_and_b64 s[2:3], exec, vcc +; FLAT-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; FLAT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; FLAT-NEXT: s_cbranch_execnz BB0_1 +; FLAT-NEXT: ; %bb.2: ; %ENDLOOP +; FLAT-NEXT: s_or_b64 exec, exec, s[0:1] +; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: v_mov_b32_e32 v0, 0 +; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; FLAT-NEXT: s_endpgm main_body: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %0 = and i32 %a, %tid @@ -25,25 +64,54 @@ ENDIF: br i1 %1, label %ENDLOOP, label %ENDIF } - -; FUNC-LABEL: {{^}}phi_cond_outside_loop: - -; SI: s_mov_b64 [[LEFT:s\[[0-9]+:[0-9]+\]]], 0 -; SI: s_mov_b64 [[PHI:s\[[0-9]+:[0-9]+\]]], 0 - -; SI: ; %else -; SI: v_cmp_eq_u32_e64 [[TMP:s\[[0-9]+:[0-9]+\]]], - -; SI: ; %endif - -; SI: [[LOOP_LABEL:BB[0-9]+_[0-9]+]]: ; %loop -; SI: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[PHI]] -; SI: s_or_b64 [[LEFT]], [[TMP1]], [[LEFT]] -; SI: s_andn2_b64 exec, exec, [[LEFT]] -; SI: s_cbranch_execnz [[LOOP_LABEL]] -; SI: s_endpgm - define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { +; SI-LABEL: phi_cond_outside_loop: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SI-NEXT: s_cbranch_execz BB1_2 +; SI-NEXT: ; %bb.1: ; %else +; 
SI-NEXT: s_load_dword s0, s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 +; SI-NEXT: s_and_b64 s[4:5], s[0:1], exec +; SI-NEXT: BB1_2: ; %endif +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: BB1_3: ; %loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_and_b64 s[0:1], exec, s[4:5] +; SI-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SI-NEXT: s_cbranch_execnz BB1_3 +; SI-NEXT: ; %bb.4: ; %exit +; SI-NEXT: s_endpgm +; +; FLAT-LABEL: phi_cond_outside_loop: +; FLAT: ; %bb.0: ; %entry +; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; FLAT-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; FLAT-NEXT: s_mov_b64 s[2:3], 0 +; FLAT-NEXT: s_mov_b64 s[4:5], 0 +; FLAT-NEXT: s_and_saveexec_b64 s[6:7], vcc +; FLAT-NEXT: s_cbranch_execz BB1_2 +; FLAT-NEXT: ; %bb.1: ; %else +; FLAT-NEXT: s_load_dword s0, s[0:1], 0x24 +; FLAT-NEXT: s_waitcnt lgkmcnt(0) +; FLAT-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 +; FLAT-NEXT: s_and_b64 s[4:5], s[0:1], exec +; FLAT-NEXT: BB1_2: ; %endif +; FLAT-NEXT: s_or_b64 exec, exec, s[6:7] +; FLAT-NEXT: BB1_3: ; %loop +; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 +; FLAT-NEXT: s_and_b64 s[0:1], exec, s[4:5] +; FLAT-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; FLAT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; FLAT-NEXT: s_cbranch_execnz BB1_3 +; FLAT-NEXT: ; %bb.4: ; %exit +; FLAT-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %0 = icmp eq i32 %tid , 0 @@ -67,11 +135,12 @@ exit: ret void } -; FIXME: should emit s_endpgm -; CHECK-LABEL: {{^}}switch_unreachable: -; CHECK-NOT: s_endpgm -; CHECK: .Lfunc_end2 define amdgpu_kernel void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind { +; SI-LABEL: switch_unreachable: +; SI: ; %bb.0: ; %centry +; +; FLAT-LABEL: switch_unreachable: +; FLAT: ; %bb.0: ; %centry centry: switch i32 %x, label %sw.default [ i32 0, label %sw.bb @@ -90,29 +159,99 @@ sw.epilog: declare float 
@llvm.fabs.f32(float) nounwind readnone -; This broke the old AMDIL cfg structurizer -; FUNC-LABEL: {{^}}loop_land_info_assert: -; SI: v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}} -; SI: s_and_b64 [[CMP4M:s\[[0-9]+:[0-9]+\]]], exec, [[CMP4]] - -; SI: [[WHILELOOP:BB[0-9]+_[0-9]+]]: ; %while.cond -; SI: s_cbranch_vccz [[FOR_COND_PH:BB[0-9]+_[0-9]+]] - -; SI: [[CONVEX_EXIT:BB[0-9_]+]] -; SI: s_mov_b64 vcc, -; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]] - -; SI: s_cbranch_vccnz [[WHILELOOP]] - -; SI: ; %if.else -; SI: buffer_store_dword - -; SI: [[FOR_COND_PH]]: ; %for.cond.preheader -; SI: s_cbranch_vccz [[ENDPGM]] - -; SI: [[ENDPGM]]: -; SI-NEXT: s_endpgm define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { +; SI-LABEL: loop_land_info_assert: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xc +; SI-NEXT: s_brev_b32 s5, 44 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, 0 +; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], s3, 4 +; SI-NEXT: s_or_b64 s[8:9], s[0:1], s[2:3] +; SI-NEXT: s_and_b64 s[0:1], exec, s[2:3] +; SI-NEXT: s_and_b64 s[2:3], exec, s[8:9] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_lt_f32_e64 s[8:9], |v0|, s5 +; SI-NEXT: v_mov_b32_e32 v0, 3 +; SI-NEXT: BB3_1: ; %while.cond +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_mov_b64 vcc, s[0:1] +; SI-NEXT: s_cbranch_vccz BB3_5 +; SI-NEXT: ; %bb.2: ; %convex.exit +; SI-NEXT: ; in Loop: Header=BB3_1 Depth=1 +; SI-NEXT: s_mov_b64 vcc, s[2:3] +; SI-NEXT: s_cbranch_vccnz BB3_8 +; SI-NEXT: ; %bb.3: ; %if.end +; SI-NEXT: ; in Loop: Header=BB3_1 Depth=1 +; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; SI-NEXT: s_cbranch_vccnz BB3_1 +; SI-NEXT: ; %bb.4: ; %if.else +; SI-NEXT: ; in Loop: Header=BB3_1 Depth=1 +; SI-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_branch BB3_1 +; SI-NEXT: BB3_5: ; %for.cond.preheader +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, 0x3e8 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 +; SI-NEXT: s_and_b64 vcc, exec, vcc +; SI-NEXT: s_cbranch_vccz BB3_8 +; SI-NEXT: ; %bb.6: ; %for.body +; SI-NEXT: s_and_b64 vcc, exec, -1 +; SI-NEXT: BB3_7: ; %self.loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_cbranch_vccnz BB3_7 +; SI-NEXT: BB3_8: ; %DummyReturnBlock +; SI-NEXT: s_endpgm +; +; FLAT-LABEL: loop_land_info_assert: +; FLAT: ; %bb.0: ; %entry +; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dword s4, s[0:1], 0x30 +; FLAT-NEXT: s_brev_b32 s5, 44 +; FLAT-NEXT: s_waitcnt lgkmcnt(0) +; FLAT-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, 0 +; FLAT-NEXT: v_cmp_lt_i32_e64 s[2:3], s3, 4 +; FLAT-NEXT: s_or_b64 s[8:9], s[0:1], s[2:3] +; FLAT-NEXT: s_and_b64 s[0:1], exec, s[2:3] +; FLAT-NEXT: s_and_b64 s[2:3], exec, s[8:9] +; FLAT-NEXT: s_waitcnt vmcnt(0) +; FLAT-NEXT: v_cmp_lt_f32_e64 s[8:9], |v0|, s5 +; FLAT-NEXT: v_mov_b32_e32 v0, 3 +; FLAT-NEXT: BB3_1: ; %while.cond +; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 +; FLAT-NEXT: s_mov_b64 vcc, s[0:1] +; FLAT-NEXT: s_cbranch_vccz BB3_5 +; FLAT-NEXT: ; %bb.2: ; %convex.exit +; FLAT-NEXT: ; in Loop: Header=BB3_1 Depth=1 +; FLAT-NEXT: s_mov_b64 vcc, s[2:3] +; FLAT-NEXT: s_cbranch_vccnz BB3_8 +; FLAT-NEXT: ; %bb.3: ; %if.end +; FLAT-NEXT: ; in Loop: Header=BB3_1 Depth=1 +; FLAT-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; FLAT-NEXT: s_cbranch_vccnz BB3_1 +; FLAT-NEXT: ; %bb.4: ; %if.else +; FLAT-NEXT: ; in Loop: Header=BB3_1 Depth=1 +; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; FLAT-NEXT: s_branch BB3_1 +; FLAT-NEXT: BB3_5: ; %for.cond.preheader +; FLAT-NEXT: v_mov_b32_e32 v0, 0x3e8 +; FLAT-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 +; FLAT-NEXT: s_and_b64 vcc, 
exec, vcc +; FLAT-NEXT: s_cbranch_vccz BB3_8 +; FLAT-NEXT: ; %bb.6: ; %for.body +; FLAT-NEXT: s_and_b64 vcc, exec, -1 +; FLAT-NEXT: BB3_7: ; %self.loop +; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 +; FLAT-NEXT: s_cbranch_vccnz BB3_7 +; FLAT-NEXT: BB3_8: ; %DummyReturnBlock +; FLAT-NEXT: s_endpgm entry: %cmp = icmp sgt i32 %c0, 0 br label %while.cond.outer diff --git a/llvm/test/CodeGen/AMDGPU/switch-unreachable.ll b/llvm/test/CodeGen/AMDGPU/switch-unreachable.ll new file mode 100644 index 00000000000000..11d71f7fe2efaa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/switch-unreachable.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s + +; This testcase was discovered in si-annotate-cf.ll, where none of the +; RUN lines was actually exercising it. See that files git log for its +; history. + +; FIXME: should emit s_endpgm +; CHECK-LABEL: {{^}}switch_unreachable: +; CHECK-NOT: s_endpgm +; CHECK: .Lfunc_end +define amdgpu_kernel void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind { +centry: + switch i32 %x, label %sw.default [ + i32 0, label %sw.bb + i32 60, label %sw.bb + ] + +sw.bb: + unreachable + +sw.default: + unreachable + +sw.epilog: + ret void +} diff --git a/llvm/test/CodeGen/AVR/PR37143.ll b/llvm/test/CodeGen/AVR/PR37143.ll index 72f4a2fd3722c3..c7cabd3cd0875d 100644 --- a/llvm/test/CodeGen/AVR/PR37143.ll +++ b/llvm/test/CodeGen/AVR/PR37143.ll @@ -1,4 +1,4 @@ -; RUN: llc -mattr=avr6,sram < %s -march=avr | FileCheck %s +; RUN: llc -mattr=avr6,sram < %s -march=avr -verify-machineinstrs | FileCheck %s ; CHECK: ld {{r[0-9]+}}, [[PTR:[XYZ]]] ; CHECK: ldd {{r[0-9]+}}, [[PTR]]+1 diff --git a/llvm/test/CodeGen/AVR/brind.ll b/llvm/test/CodeGen/AVR/brind.ll index ec8262e84a952d..4eea966062db79 100644 --- a/llvm/test/CodeGen/AVR/brind.ll +++ b/llvm/test/CodeGen/AVR/brind.ll @@ -1,4 +1,4 @@ -; RUN: llc -mattr=sram,eijmpcall < %s -march=avr | FileCheck %s +; RUN: llc 
-mattr=sram,eijmpcall < %s -march=avr -verify-machineinstrs | FileCheck %s @brind.k = private unnamed_addr constant [2 x i8*] [i8* blockaddress(@brind, %return), i8* blockaddress(@brind, %b)], align 1 diff --git a/llvm/test/CodeGen/AVR/load.ll b/llvm/test/CodeGen/AVR/load.ll index dbadacfd5e0de1..53748b3b100b92 100644 --- a/llvm/test/CodeGen/AVR/load.ll +++ b/llvm/test/CodeGen/AVR/load.ll @@ -1,4 +1,4 @@ -; RUN: llc -mattr=avr6,sram < %s -march=avr | FileCheck %s +; RUN: llc -mattr=avr6,sram < %s -march=avr -verify-machineinstrs | FileCheck %s define i8 @load8(i8* %x) { ; CHECK-LABEL: load8: diff --git a/llvm/test/CodeGen/AVR/pseudo/LDWRdPtr.mir b/llvm/test/CodeGen/AVR/pseudo/LDWRdPtr.mir index 5bd4bf2d431c8b..2343d0df49274a 100644 --- a/llvm/test/CodeGen/AVR/pseudo/LDWRdPtr.mir +++ b/llvm/test/CodeGen/AVR/pseudo/LDWRdPtr.mir @@ -17,7 +17,7 @@ body: | ; CHECK-LABEL: test_ldwrdptr - ; CHECK: $r0, $r31r30 = LDRdPtr + ; CHECK: $r0 = LDRdPtr $r31r30 ; CHECK-NEXT: $r1 = LDDRdPtrQ $r31r30, 1 $r1r0 = LDWRdPtr $r31r30 diff --git a/llvm/test/CodeGen/Hexagon/addrmode-align.ll b/llvm/test/CodeGen/Hexagon/addrmode-align.ll index 1a4df00d47cbdc..f39019a0b40ef5 100644 --- a/llvm/test/CodeGen/Hexagon/addrmode-align.ll +++ b/llvm/test/CodeGen/Hexagon/addrmode-align.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=hexagon < %s | FileCheck %s ; CHECK: [[REG0:(r[0-9]+)]] = add(r29 -; CHECK: [[REG1:(r[0-9]+)]] = add([[REG0]],#4) +; CHECK: [[REG1:(r[0-9]+)]] = add([[REG0]],#8) ; CHECK-DAG: memd([[REG1]]+#8) = ; CHECK-DAG: memd([[REG1]]+#0) = diff --git a/llvm/test/CodeGen/Hexagon/lsr-postinc-nested-loop.ll b/llvm/test/CodeGen/Hexagon/lsr-postinc-nested-loop.ll new file mode 100644 index 00000000000000..8fbf913a22cbb0 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/lsr-postinc-nested-loop.ll @@ -0,0 +1,50 @@ +; RUN: llc -O3 -march=hexagon < %s | FileCheck %s +; Test to ensure LSR does not optimize out addrec of the outerloop. 
+; This will help to generate post-increment instructions, otherwise +; it end up an as extra reg+reg add inside the loop. +; CHECK: loop0(.LBB0_[[LOOP:.]], +; CHECK: .LBB0_[[LOOP]]: +; CHECK: memuh{{.*}}++ +; CHECK: endloop + + +define dso_local signext i16 @foo(i16* nocapture readonly %filt, i16* nocapture readonly %inp, i32 %c1, i32 %c2) local_unnamed_addr { +entry: + %cmp28 = icmp sgt i32 %c1, 0 + %cmp221 = icmp sgt i32 %c2, 0 + %or.cond = and i1 %cmp28, %cmp221 + br i1 %or.cond, label %for.cond1.preheader.us, label %for.cond.cleanup + +for.cond1.preheader.us: ; preds = %entry, %for.cond1.for.cond.cleanup3_crit_edge.us + %filt.addr.032.us = phi i16* [ %scevgep, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %filt, %entry ] + %inp.addr.031.us = phi i16* [ %scevgep35, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %inp, %entry ] + %l.030.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ] + %sum0.029.us = phi i16 [ %add8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ] + %scevgep = getelementptr i16, i16* %filt.addr.032.us, i32 %c2 + br label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us + %z.025.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ] + %filt.addr.124.us = phi i16* [ %filt.addr.032.us, %for.cond1.preheader.us ], [ %incdec.ptr.us, %for.body4.us ] + %inp.addr.123.us = phi i16* [ %inp.addr.031.us, %for.cond1.preheader.us ], [ %incdec.ptr5.us, %for.body4.us ] + %sum0.122.us = phi i16 [ %sum0.029.us, %for.cond1.preheader.us ], [ %add8.us, %for.body4.us ] + %incdec.ptr.us = getelementptr inbounds i16, i16* %filt.addr.124.us, i32 1 + %0 = load i16, i16* %filt.addr.124.us, align 2 + %incdec.ptr5.us = getelementptr inbounds i16, i16* %inp.addr.123.us, i32 1 + %1 = load i16, i16* %inp.addr.123.us, align 2 + %add.us = add i16 %0, %sum0.122.us + %add8.us = add i16 %add.us, %1 + %inc.us = add nuw nsw i32 %z.025.us, 1 + %exitcond = icmp eq i32 %inc.us, %c2 + br i1 
%exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %scevgep35 = getelementptr i16, i16* %inp.addr.031.us, i32 %c2 + %inc11.us = add nuw nsw i32 %l.030.us, 1 + %exitcond36 = icmp eq i32 %inc11.us, %c1 + br i1 %exitcond36, label %for.cond.cleanup, label %for.cond1.preheader.us + +for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry + %sum0.0.lcssa = phi i16 [ 0, %entry ], [ %add8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ] + ret i16 %sum0.0.lcssa +} diff --git a/llvm/test/CodeGen/Hexagon/verify-liveness-at-def.mir b/llvm/test/CodeGen/Hexagon/verify-liveness-at-def.mir index fefe245140990b..d57325e5b27db8 100644 --- a/llvm/test/CodeGen/Hexagon/verify-liveness-at-def.mir +++ b/llvm/test/CodeGen/Hexagon/verify-liveness-at-def.mir @@ -40,21 +40,21 @@ body: | # CHECK-SUB: Bad machine code: Live range continues after dead def flag # CHECK_SUB-NEXT: function: test_fail # CHECK-SUB: v. register: %0 -# CHECK-SUB: lanemask: 00000002 +# CHECK-SUB: lanemask: 0000000000000002 # # CHECK-SUB-NOT: Bad machine code # # CHECK-SUB: Bad machine code: Live range continues after dead def flag # CHECK-SUB-NEXT: function: test_fail # CHECK-SUB: v. register: %1 -# CHECK-SUB: lanemask: 00000002 +# CHECK-SUB: lanemask: 0000000000000002 # # CHECK-SUB-NOT: Bad machine code # # CHECK-SUB: Bad machine code: Live range continues after dead def flag # CHECK-SUB-NEXT: function: test_fail # CHECK-SUB: v. 
register: %1 -# CHECK-SUB: lanemask: 00000001 +# CHECK-SUB: lanemask: 0000000000000001 # # CHECK-SUB: Bad machine code: Live range continues after dead def flag # CHECK-SUB-NEXT: function: test_fail diff --git a/llvm/test/CodeGen/MIR/Hexagon/parse-lane-masks.mir b/llvm/test/CodeGen/MIR/Hexagon/parse-lane-masks.mir index 1b6dc3b4c41bfa..915c354b5a0ff7 100644 --- a/llvm/test/CodeGen/MIR/Hexagon/parse-lane-masks.mir +++ b/llvm/test/CodeGen/MIR/Hexagon/parse-lane-masks.mir @@ -3,7 +3,7 @@ # CHECK-LABEL: name: foo # CHECK: bb.0: -# CHECK: liveins: $d0:0x00000002, $d1, $d2:0x00000010 +# CHECK: liveins: $d0:0x0000000000000002, $d1, $d2:0x0000000000000010 --- | define void @foo() { diff --git a/llvm/test/CodeGen/SystemZ/codegenprepare-form-OF-ops.ll b/llvm/test/CodeGen/SystemZ/codegenprepare-form-OF-ops.ll new file mode 100644 index 00000000000000..161f4bc2b7658d --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/codegenprepare-form-OF-ops.ll @@ -0,0 +1,54 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -O3 | FileCheck %s +; +; Check that CodeGenPrepare transforms these functions to use +; uadd.with.overflow / usub.with.overflow intrinsics so that the compare +; instruction is eliminated. 
+ +define i32 @uaddo_32(i32 %arg) { +; CHECK-LABEL: uaddo_32: +; CHECK: alhsik %r0, %r2, -1 +; CHECK: locrnle %r2, %r0 +; CHECK: br %r14 + +bb: + %tmp10 = icmp ne i32 %arg, 0 + %tmp11 = add nsw i32 %arg, -1 + %tmp12 = select i1 %tmp10, i32 %tmp11, i32 %arg + ret i32 %tmp12 +} + +define i64 @uaddo_64(i64 %arg) { +; CHECK-LABEL: uaddo_64: +; CHECK: alghsik %r0, %r2, -1 +; CHECK: locgrnle %r2, %r0 +; CHECK: br %r14 +bb: + %tmp10 = icmp ne i64 %arg, 0 + %tmp11 = add nsw i64 %arg, -1 + %tmp12 = select i1 %tmp10, i64 %tmp11, i64 %arg + ret i64 %tmp12 +} + +define i32 @usubo_32(i32 %arg) { +; CHECK-LABEL: usubo_32: +; CHECK: alhsik %r0, %r2, -1 +; CHECK: locrle %r2, %r0 +; CHECK: br %r14 +bb: + %tmp10 = icmp eq i32 %arg, 0 + %tmp11 = sub nsw i32 %arg, 1 + %tmp12 = select i1 %tmp10, i32 %tmp11, i32 %arg + ret i32 %tmp12 +} + +define i64 @usubo_64(i64 %arg) { +; CHECK-LABEL: usubo_64: +; CHECK: alghsik %r0, %r2, -1 +; CHECK: locgrle %r2, %r0 +; CHECK: br %r14 +bb: + %tmp10 = icmp eq i64 %arg, 0 + %tmp11 = sub nsw i64 %arg, 1 + %tmp12 = select i1 %tmp10, i64 %tmp11, i64 %arg + ret i64 %tmp12 +} diff --git a/llvm/test/CodeGen/SystemZ/dag-combine-05.ll b/llvm/test/CodeGen/SystemZ/dag-combine-05.ll index 78b129fc2f731c..eb9fcc29692108 100644 --- a/llvm/test/CodeGen/SystemZ/dag-combine-05.ll +++ b/llvm/test/CodeGen/SystemZ/dag-combine-05.ll @@ -26,10 +26,13 @@ bb: %tmp = icmp ult i16 %arg0, 9616 %tmp1 = zext i1 %tmp to i32 %tmp2 = load i16, i16* %src - %tmp3 = add i16 %tmp2, -1 - %tmp4 = icmp ne i16 %tmp2, 0 - %tmp5 = zext i1 %tmp4 to i32 + %0 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %tmp2, i16 -1) + %math = extractvalue { i16, i1 } %0, 0 + %ov = extractvalue { i16, i1 } %0, 1 + %tmp5 = zext i1 %ov to i32 %tmp6 = add nuw nsw i32 %tmp5, %tmp1 store i32 %tmp6, i32* %dst ret void } + +declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) #1 diff --git a/llvm/test/CodeGen/SystemZ/frame-25.ll b/llvm/test/CodeGen/SystemZ/frame-25.ll new file mode 100644 index 
00000000000000..64c175bd4ecaa2 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/frame-25.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; +; Test that space is allocated for the incoming back chain also in cases +; where no GPRs are saved / restored. + +define void @fun0() #0 { +; CHECK-LABEL: fun0: +; CHECK: lgr %r1, %r15 +; CHECK-NEXT: aghi %r15, -24 +; CHECK-NEXT: stg %r1, 152(%r15) +; CHECK-NEXT: #APP +; CHECK-NEXT: stcke 160(%r15) +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: aghi %r15, 24 +; CHECK-NEXT: br %r14 + +entry: + %b = alloca [16 x i8], align 1 + %0 = getelementptr inbounds [16 x i8], [16 x i8]* %b, i64 0, i64 0 + call void asm "stcke $0", "=*Q"([16 x i8]* nonnull %b) #2 + ret void +} + +attributes #0 = { nounwind "packed-stack" "backchain" "use-soft-float"="true" } diff --git a/llvm/test/CodeGen/Thumb/remove-unneeded-push-pop.ll b/llvm/test/CodeGen/Thumb/remove-unneeded-push-pop.ll deleted file mode 100644 index 054be2ea858721..00000000000000 --- a/llvm/test/CodeGen/Thumb/remove-unneeded-push-pop.ll +++ /dev/null @@ -1,1052 +0,0 @@ -; RUN: llc -O0 -mtriple thumbv6m-arm-none-eabi < %s | FileCheck %s - -@a = external hidden global i32*, align 4 -@f = external hidden global i32, align 4 - -define hidden void @foo() { -entry: -; CHECK-NOT: push {lr} -; CHECK-NOT: pop {pc} - store i32 24654, i32* @f, align 4 - br label %if.end - -if.end: ; preds = %entry - %0 = load i32*, i32** @a, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %0, i32 2 - %1 = load i32, i32* %arrayidx1, align 4 - %tobool2 = icmp ne i32 %1, 0 - br i1 %tobool2, label %if.then3, label %if.end4 - -if.then3: ; preds = %if.end - store i32 17785, i32* @f, align 4 - br label %if.end4 - -if.end4: ; preds = %if.then3, %if.end - %2 = load i32*, i32** @a, align 4 - %arrayidx5 = getelementptr inbounds i32, i32* %2, i32 3 - %3 = load i32, i32* %arrayidx5, align 4 - %tobool6 = icmp ne i32 %3, 0 - br i1 %tobool6, label %if.then7, label %if.end8 - -if.then7: ; preds = 
%if.end4 - store i32 10342, i32* @f, align 4 - br label %if.end8 - -if.end8: ; preds = %if.then7, %if.end4 - %4 = load i32*, i32** @a, align 4 - %arrayidx9 = getelementptr inbounds i32, i32* %4, i32 4 - %5 = load i32, i32* %arrayidx9, align 4 - %tobool10 = icmp ne i32 %5, 0 - br i1 %tobool10, label %if.then11, label %if.end12 - -if.then11: ; preds = %if.end8 - store i32 29082, i32* @f, align 4 - br label %if.end12 - -if.end12: ; preds = %if.then11, %if.end8 - %6 = load i32*, i32** @a, align 4 - %arrayidx13 = getelementptr inbounds i32, i32* %6, i32 5 - %7 = load i32, i32* %arrayidx13, align 4 - %tobool14 = icmp ne i32 %7, 0 - br i1 %tobool14, label %if.then15, label %if.end16 - -if.then15: ; preds = %if.end12 - store i32 29893, i32* @f, align 4 - br label %if.end16 - -if.end16: ; preds = %if.then15, %if.end12 - %8 = load i32*, i32** @a, align 4 - %arrayidx17 = getelementptr inbounds i32, i32* %8, i32 6 - %9 = load i32, i32* %arrayidx17, align 4 - %tobool18 = icmp ne i32 %9, 0 - br i1 %tobool18, label %if.then19, label %if.end20 - -if.then19: ; preds = %if.end16 - store i32 19071, i32* @f, align 4 - br label %if.end20 - -if.end20: ; preds = %if.then19, %if.end16 - %10 = load i32*, i32** @a, align 4 - %arrayidx21 = getelementptr inbounds i32, i32* %10, i32 7 - %11 = load i32, i32* %arrayidx21, align 4 - %tobool22 = icmp ne i32 %11, 0 - br i1 %tobool22, label %if.then23, label %if.end24 - -if.then23: ; preds = %if.end20 - store i32 6154, i32* @f, align 4 - br label %if.end24 - -if.end24: ; preds = %if.then23, %if.end20 - %12 = load i32*, i32** @a, align 4 - %arrayidx25 = getelementptr inbounds i32, i32* %12, i32 8 - %13 = load i32, i32* %arrayidx25, align 4 - %tobool26 = icmp ne i32 %13, 0 - br i1 %tobool26, label %if.then27, label %if.end28 - -if.then27: ; preds = %if.end24 - store i32 30498, i32* @f, align 4 - br label %if.end28 - -if.end28: ; preds = %if.then27, %if.end24 - %14 = load i32*, i32** @a, align 4 - %arrayidx29 = getelementptr inbounds i32, i32* %14, i32 
9 - %15 = load i32, i32* %arrayidx29, align 4 - %tobool30 = icmp ne i32 %15, 0 - br i1 %tobool30, label %if.then31, label %if.end32 - -if.then31: ; preds = %if.end28 - store i32 16667, i32* @f, align 4 - br label %if.end32 - -if.end32: ; preds = %if.then31, %if.end28 - %16 = load i32*, i32** @a, align 4 - %arrayidx33 = getelementptr inbounds i32, i32* %16, i32 10 - %17 = load i32, i32* %arrayidx33, align 4 - %tobool34 = icmp ne i32 %17, 0 - br i1 %tobool34, label %if.then35, label %if.end36 - -if.then35: ; preds = %if.end32 - store i32 195, i32* @f, align 4 - br label %if.end36 - -if.end36: ; preds = %if.then35, %if.end32 - %18 = load i32*, i32** @a, align 4 - %arrayidx37 = getelementptr inbounds i32, i32* %18, i32 11 - %19 = load i32, i32* %arrayidx37, align 4 - %tobool38 = icmp ne i32 %19, 0 - br i1 %tobool38, label %if.then39, label %if.end40 - -if.then39: ; preds = %if.end36 - store i32 14665, i32* @f, align 4 - br label %if.end40 - -if.end40: ; preds = %if.then39, %if.end36 - %20 = load i32*, i32** @a, align 4 - %arrayidx41 = getelementptr inbounds i32, i32* %20, i32 12 - %21 = load i32, i32* %arrayidx41, align 4 - %tobool42 = icmp ne i32 %21, 0 - br i1 %tobool42, label %if.then43, label %if.end44 - -if.then43: ; preds = %if.end40 - store i32 19305, i32* @f, align 4 - br label %if.end44 - -if.end44: ; preds = %if.then43, %if.end40 - %22 = load i32*, i32** @a, align 4 - %arrayidx45 = getelementptr inbounds i32, i32* %22, i32 13 - %23 = load i32, i32* %arrayidx45, align 4 - %tobool46 = icmp ne i32 %23, 0 - br i1 %tobool46, label %if.then47, label %if.end48 - -if.then47: ; preds = %if.end44 - store i32 15133, i32* @f, align 4 - br label %if.end48 - -if.end48: ; preds = %if.then47, %if.end44 - %24 = load i32*, i32** @a, align 4 - %arrayidx49 = getelementptr inbounds i32, i32* %24, i32 14 - %25 = load i32, i32* %arrayidx49, align 4 - %tobool50 = icmp ne i32 %25, 0 - br i1 %tobool50, label %if.then51, label %if.end52 - -if.then51: ; preds = %if.end48 - store i32 
19173, i32* @f, align 4 - br label %if.end52 - -if.end52: ; preds = %if.then51, %if.end48 - br label %if.then55 - -if.then55: ; preds = %if.end52 - store i32 14025, i32* @f, align 4 - br label %if.end56 - -if.end56: ; preds = %if.then55 - %26 = load i32*, i32** @a, align 4 - %arrayidx57 = getelementptr inbounds i32, i32* %26, i32 16 - %27 = load i32, i32* %arrayidx57, align 4 - %tobool58 = icmp ne i32 %27, 0 - br i1 %tobool58, label %if.then59, label %if.end60 - -if.then59: ; preds = %if.end56 - store i32 8209, i32* @f, align 4 - br label %if.end60 - -if.end60: ; preds = %if.then59, %if.end56 - %28 = load i32*, i32** @a, align 4 - %arrayidx61 = getelementptr inbounds i32, i32* %28, i32 17 - %29 = load i32, i32* %arrayidx61, align 4 - %tobool62 = icmp ne i32 %29, 0 - br i1 %tobool62, label %if.then63, label %if.end64 - -if.then63: ; preds = %if.end60 - store i32 29621, i32* @f, align 4 - br label %if.end64 - -if.end64: ; preds = %if.then63, %if.end60 - %30 = load i32*, i32** @a, align 4 - %arrayidx65 = getelementptr inbounds i32, i32* %30, i32 18 - %31 = load i32, i32* %arrayidx65, align 4 - %tobool66 = icmp ne i32 %31, 0 - br i1 %tobool66, label %if.then67, label %if.end68 - -if.then67: ; preds = %if.end64 - store i32 14963, i32* @f, align 4 - br label %if.end68 - -if.end68: ; preds = %if.then67, %if.end64 - %32 = load i32*, i32** @a, align 4 - %arrayidx69 = getelementptr inbounds i32, i32* %32, i32 19 - %33 = load i32, i32* %arrayidx69, align 4 - %tobool70 = icmp ne i32 %33, 0 - br i1 %tobool70, label %if.then71, label %if.end72 - -if.then71: ; preds = %if.end68 - store i32 32282, i32* @f, align 4 - br label %if.end72 - -if.end72: ; preds = %if.then71, %if.end68 - %34 = load i32*, i32** @a, align 4 - %arrayidx73 = getelementptr inbounds i32, i32* %34, i32 20 - %35 = load i32, i32* %arrayidx73, align 4 - %tobool74 = icmp ne i32 %35, 0 - br i1 %tobool74, label %if.then75, label %if.end76 - -if.then75: ; preds = %if.end72 - store i32 3072, i32* @f, align 4 - br label 
%if.end76 - -if.end76: ; preds = %if.then75, %if.end72 - %36 = load i32*, i32** @a, align 4 - %arrayidx77 = getelementptr inbounds i32, i32* %36, i32 21 - %37 = load i32, i32* %arrayidx77, align 4 - %tobool78 = icmp ne i32 %37, 0 - br i1 %tobool78, label %if.then79, label %if.end80 - -if.then79: ; preds = %if.end76 - store i32 1992, i32* @f, align 4 - br label %if.end80 - -if.end80: ; preds = %if.then79, %if.end76 - %38 = load i32*, i32** @a, align 4 - %arrayidx81 = getelementptr inbounds i32, i32* %38, i32 22 - %39 = load i32, i32* %arrayidx81, align 4 - %tobool82 = icmp ne i32 %39, 0 - br i1 %tobool82, label %if.then83, label %if.end84 - -if.then83: ; preds = %if.end80 - store i32 9614, i32* @f, align 4 - br label %if.end84 - -if.end84: ; preds = %if.then83, %if.end80 - %40 = load i32*, i32** @a, align 4 - %arrayidx85 = getelementptr inbounds i32, i32* %40, i32 23 - %41 = load i32, i32* %arrayidx85, align 4 - %tobool86 = icmp ne i32 %41, 0 - br i1 %tobool86, label %if.then87, label %if.end88 - -if.then87: ; preds = %if.end84 - store i32 25931, i32* @f, align 4 - br label %if.end88 - -if.end88: ; preds = %if.then87, %if.end84 - %42 = load i32*, i32** @a, align 4 - %arrayidx89 = getelementptr inbounds i32, i32* %42, i32 24 - %43 = load i32, i32* %arrayidx89, align 4 - %tobool90 = icmp ne i32 %43, 0 - br i1 %tobool90, label %if.then91, label %if.end92 - -if.then91: ; preds = %if.end88 - store i32 22035, i32* @f, align 4 - br label %if.end92 - -if.end92: ; preds = %if.then91, %if.end88 - %44 = load i32*, i32** @a, align 4 - %arrayidx93 = getelementptr inbounds i32, i32* %44, i32 25 - %45 = load i32, i32* %arrayidx93, align 4 - %tobool94 = icmp ne i32 %45, 0 - br i1 %tobool94, label %if.then95, label %if.end96 - -if.then95: ; preds = %if.end92 - store i32 10712, i32* @f, align 4 - br label %if.end96 - -if.end96: ; preds = %if.then95, %if.end92 - %46 = load i32*, i32** @a, align 4 - %arrayidx97 = getelementptr inbounds i32, i32* %46, i32 26 - %47 = load i32, i32* 
%arrayidx97, align 4 - %tobool98 = icmp ne i32 %47, 0 - br i1 %tobool98, label %if.then99, label %if.end100 - -if.then99: ; preds = %if.end96 - store i32 18267, i32* @f, align 4 - br label %if.end100 - -if.end100: ; preds = %if.then99, %if.end96 - %48 = load i32*, i32** @a, align 4 - %arrayidx101 = getelementptr inbounds i32, i32* %48, i32 27 - %49 = load i32, i32* %arrayidx101, align 4 - %tobool102 = icmp ne i32 %49, 0 - br i1 %tobool102, label %if.then103, label %if.end104 - -if.then103: ; preds = %if.end100 - store i32 30432, i32* @f, align 4 - br label %if.end104 - -if.end104: ; preds = %if.then103, %if.end100 - %50 = load i32*, i32** @a, align 4 - %arrayidx105 = getelementptr inbounds i32, i32* %50, i32 28 - %51 = load i32, i32* %arrayidx105, align 4 - %tobool106 = icmp ne i32 %51, 0 - br i1 %tobool106, label %if.then107, label %if.end108 - -if.then107: ; preds = %if.end104 - store i32 5847, i32* @f, align 4 - br label %if.end108 - -if.end108: ; preds = %if.then107, %if.end104 - %52 = load i32*, i32** @a, align 4 - %arrayidx109 = getelementptr inbounds i32, i32* %52, i32 29 - %53 = load i32, i32* %arrayidx109, align 4 - %tobool110 = icmp ne i32 %53, 0 - br i1 %tobool110, label %if.then111, label %if.end112 - -if.then111: ; preds = %if.end108 - store i32 14705, i32* @f, align 4 - br label %if.end112 - -if.end112: ; preds = %if.then111, %if.end108 - %54 = load i32*, i32** @a, align 4 - %arrayidx113 = getelementptr inbounds i32, i32* %54, i32 30 - %55 = load i32, i32* %arrayidx113, align 4 - %tobool114 = icmp ne i32 %55, 0 - br i1 %tobool114, label %if.then115, label %if.end116 - -if.then115: ; preds = %if.end112 - store i32 28488, i32* @f, align 4 - br label %if.end116 - -if.end116: ; preds = %if.then115, %if.end112 - %56 = load i32*, i32** @a, align 4 - %arrayidx117 = getelementptr inbounds i32, i32* %56, i32 31 - %57 = load i32, i32* %arrayidx117, align 4 - %tobool118 = icmp ne i32 %57, 0 - br i1 %tobool118, label %if.then119, label %if.end120 - -if.then119: ; 
preds = %if.end116 - store i32 13853, i32* @f, align 4 - br label %if.end120 - -if.end120: ; preds = %if.then119, %if.end116 - %58 = load i32*, i32** @a, align 4 - %arrayidx121 = getelementptr inbounds i32, i32* %58, i32 32 - %59 = load i32, i32* %arrayidx121, align 4 - %tobool122 = icmp ne i32 %59, 0 - br i1 %tobool122, label %if.then123, label %if.end124 - -if.then123: ; preds = %if.end120 - store i32 31379, i32* @f, align 4 - br label %if.end124 - -if.end124: ; preds = %if.then123, %if.end120 - %60 = load i32*, i32** @a, align 4 - %arrayidx125 = getelementptr inbounds i32, i32* %60, i32 33 - %61 = load i32, i32* %arrayidx125, align 4 - %tobool126 = icmp ne i32 %61, 0 - br i1 %tobool126, label %if.then127, label %if.end128 - -if.then127: ; preds = %if.end124 - store i32 7010, i32* @f, align 4 - br label %if.end128 - -if.end128: ; preds = %if.then127, %if.end124 - br label %if.then131 - -if.then131: ; preds = %if.end128 - store i32 31840, i32* @f, align 4 - br label %if.end132 - -if.end132: ; preds = %if.then131 - %62 = load i32*, i32** @a, align 4 - %arrayidx133 = getelementptr inbounds i32, i32* %62, i32 35 - %63 = load i32, i32* %arrayidx133, align 4 - %tobool134 = icmp ne i32 %63, 0 - br i1 %tobool134, label %if.then135, label %if.end136 - -if.then135: ; preds = %if.end132 - store i32 16119, i32* @f, align 4 - br label %if.end136 - -if.end136: ; preds = %if.then135, %if.end132 - %64 = load i32*, i32** @a, align 4 - %arrayidx137 = getelementptr inbounds i32, i32* %64, i32 36 - %65 = load i32, i32* %arrayidx137, align 4 - %tobool138 = icmp ne i32 %65, 0 - br i1 %tobool138, label %if.then139, label %if.end140 - -if.then139: ; preds = %if.end136 - store i32 7119, i32* @f, align 4 - br label %if.end140 - -if.end140: ; preds = %if.then139, %if.end136 - %66 = load i32*, i32** @a, align 4 - %arrayidx141 = getelementptr inbounds i32, i32* %66, i32 37 - %67 = load i32, i32* %arrayidx141, align 4 - %tobool142 = icmp ne i32 %67, 0 - br i1 %tobool142, label %if.then143, 
label %if.end144 - -if.then143: ; preds = %if.end140 - store i32 3333, i32* @f, align 4 - br label %if.end144 - -if.end144: ; preds = %if.then143, %if.end140 - %68 = load i32*, i32** @a, align 4 - %arrayidx145 = getelementptr inbounds i32, i32* %68, i32 38 - %69 = load i32, i32* %arrayidx145, align 4 - %tobool146 = icmp ne i32 %69, 0 - br i1 %tobool146, label %if.then147, label %if.end148 - -if.then147: ; preds = %if.end144 - store i32 6430, i32* @f, align 4 - br label %if.end148 - -if.end148: ; preds = %if.then147, %if.end144 - %70 = load i32*, i32** @a, align 4 - %arrayidx149 = getelementptr inbounds i32, i32* %70, i32 39 - %71 = load i32, i32* %arrayidx149, align 4 - %tobool150 = icmp ne i32 %71, 0 - br i1 %tobool150, label %if.then151, label %if.end152 - -if.then151: ; preds = %if.end148 - store i32 19857, i32* @f, align 4 - br label %if.end152 - -if.end152: ; preds = %if.then151, %if.end148 - %72 = load i32*, i32** @a, align 4 - %arrayidx153 = getelementptr inbounds i32, i32* %72, i32 40 - %73 = load i32, i32* %arrayidx153, align 4 - %tobool154 = icmp ne i32 %73, 0 - br i1 %tobool154, label %if.then155, label %if.end156 - -if.then155: ; preds = %if.end152 - store i32 13237, i32* @f, align 4 - br label %if.end156 - -if.end156: ; preds = %if.then155, %if.end152 - br label %if.then159 - -if.then159: ; preds = %if.end156 - store i32 163, i32* @f, align 4 - br label %if.end160 - -if.end160: ; preds = %if.then159 - %74 = load i32*, i32** @a, align 4 - %arrayidx161 = getelementptr inbounds i32, i32* %74, i32 42 - %75 = load i32, i32* %arrayidx161, align 4 - %tobool162 = icmp ne i32 %75, 0 - br i1 %tobool162, label %if.then163, label %if.end164 - -if.then163: ; preds = %if.end160 - store i32 1961, i32* @f, align 4 - br label %if.end164 - -if.end164: ; preds = %if.then163, %if.end160 - %76 = load i32*, i32** @a, align 4 - %arrayidx165 = getelementptr inbounds i32, i32* %76, i32 43 - %77 = load i32, i32* %arrayidx165, align 4 - %tobool166 = icmp ne i32 %77, 0 - br i1 
%tobool166, label %if.then167, label %if.end168 - -if.then167: ; preds = %if.end164 - store i32 11325, i32* @f, align 4 - br label %if.end168 - -if.end168: ; preds = %if.then167, %if.end164 - %78 = load i32*, i32** @a, align 4 - %arrayidx169 = getelementptr inbounds i32, i32* %78, i32 44 - %79 = load i32, i32* %arrayidx169, align 4 - %tobool170 = icmp ne i32 %79, 0 - br i1 %tobool170, label %if.then171, label %if.end172 - -if.then171: ; preds = %if.end168 - store i32 12189, i32* @f, align 4 - br label %if.end172 - -if.end172: ; preds = %if.then171, %if.end168 - %80 = load i32*, i32** @a, align 4 - %arrayidx173 = getelementptr inbounds i32, i32* %80, i32 45 - %81 = load i32, i32* %arrayidx173, align 4 - %tobool174 = icmp ne i32 %81, 0 - br i1 %tobool174, label %if.then175, label %if.end176 - -if.then175: ; preds = %if.end172 - store i32 15172, i32* @f, align 4 - br label %if.end176 - -if.end176: ; preds = %if.then175, %if.end172 - br label %if.then179 - -if.then179: ; preds = %if.end176 - store i32 13491, i32* @f, align 4 - br label %if.end180 - -if.end180: ; preds = %if.then179 - %82 = load i32*, i32** @a, align 4 - %arrayidx181 = getelementptr inbounds i32, i32* %82, i32 47 - %83 = load i32, i32* %arrayidx181, align 4 - %tobool182 = icmp ne i32 %83, 0 - br i1 %tobool182, label %if.then183, label %if.end184 - -if.then183: ; preds = %if.end180 - store i32 9521, i32* @f, align 4 - br label %if.end184 - -if.end184: ; preds = %if.then183, %if.end180 - %84 = load i32*, i32** @a, align 4 - %arrayidx185 = getelementptr inbounds i32, i32* %84, i32 48 - %85 = load i32, i32* %arrayidx185, align 4 - %tobool186 = icmp ne i32 %85, 0 - br i1 %tobool186, label %if.then187, label %if.end188 - -if.then187: ; preds = %if.end184 - store i32 448, i32* @f, align 4 - br label %if.end188 - -if.end188: ; preds = %if.then187, %if.end184 - %86 = load i32*, i32** @a, align 4 - %arrayidx189 = getelementptr inbounds i32, i32* %86, i32 49 - %87 = load i32, i32* %arrayidx189, align 4 - 
%tobool190 = icmp ne i32 %87, 0 - br i1 %tobool190, label %if.then191, label %if.end192 - -if.then191: ; preds = %if.end188 - store i32 13468, i32* @f, align 4 - br label %if.end192 - -if.end192: ; preds = %if.then191, %if.end188 - %88 = load i32*, i32** @a, align 4 - %arrayidx193 = getelementptr inbounds i32, i32* %88, i32 50 - %89 = load i32, i32* %arrayidx193, align 4 - %tobool194 = icmp ne i32 %89, 0 - br i1 %tobool194, label %if.then195, label %if.end196 - -if.then195: ; preds = %if.end192 - store i32 16190, i32* @f, align 4 - br label %if.end196 - -if.end196: ; preds = %if.then195, %if.end192 - %90 = load i32*, i32** @a, align 4 - %arrayidx197 = getelementptr inbounds i32, i32* %90, i32 51 - %91 = load i32, i32* %arrayidx197, align 4 - %tobool198 = icmp ne i32 %91, 0 - br i1 %tobool198, label %if.then199, label %if.end200 - -if.then199: ; preds = %if.end196 - store i32 8602, i32* @f, align 4 - br label %if.end200 - -if.end200: ; preds = %if.then199, %if.end196 - %92 = load i32*, i32** @a, align 4 - %arrayidx201 = getelementptr inbounds i32, i32* %92, i32 52 - %93 = load i32, i32* %arrayidx201, align 4 - %tobool202 = icmp ne i32 %93, 0 - br i1 %tobool202, label %if.then203, label %if.end204 - -if.then203: ; preds = %if.end200 - store i32 21083, i32* @f, align 4 - br label %if.end204 - -if.end204: ; preds = %if.then203, %if.end200 - %94 = load i32*, i32** @a, align 4 - %arrayidx205 = getelementptr inbounds i32, i32* %94, i32 53 - %95 = load i32, i32* %arrayidx205, align 4 - %tobool206 = icmp ne i32 %95, 0 - br i1 %tobool206, label %if.then207, label %if.end208 - -if.then207: ; preds = %if.end204 - store i32 5172, i32* @f, align 4 - br label %if.end208 - -if.end208: ; preds = %if.then207, %if.end204 - %96 = load i32*, i32** @a, align 4 - %arrayidx209 = getelementptr inbounds i32, i32* %96, i32 54 - %97 = load i32, i32* %arrayidx209, align 4 - %tobool210 = icmp ne i32 %97, 0 - br i1 %tobool210, label %if.then211, label %if.end212 - -if.then211: ; preds = 
%if.end208 - store i32 32505, i32* @f, align 4 - br label %if.end212 - -if.end212: ; preds = %if.then211, %if.end208 - br label %if.then215 - -if.then215: ; preds = %if.end212 - store i32 23490, i32* @f, align 4 - br label %if.end216 - -if.end216: ; preds = %if.then215 - %98 = load i32*, i32** @a, align 4 - %arrayidx217 = getelementptr inbounds i32, i32* %98, i32 56 - %99 = load i32, i32* %arrayidx217, align 4 - %tobool218 = icmp ne i32 %99, 0 - br i1 %tobool218, label %if.then219, label %if.end220 - -if.then219: ; preds = %if.end216 - store i32 30699, i32* @f, align 4 - br label %if.end220 - -if.end220: ; preds = %if.then219, %if.end216 - %100 = load i32*, i32** @a, align 4 - %arrayidx221 = getelementptr inbounds i32, i32* %100, i32 57 - %101 = load i32, i32* %arrayidx221, align 4 - %tobool222 = icmp ne i32 %101, 0 - br i1 %tobool222, label %if.then223, label %if.end224 - -if.then223: ; preds = %if.end220 - store i32 16286, i32* @f, align 4 - br label %if.end224 - -if.end224: ; preds = %if.then223, %if.end220 - %102 = load i32*, i32** @a, align 4 - %arrayidx225 = getelementptr inbounds i32, i32* %102, i32 58 - %103 = load i32, i32* %arrayidx225, align 4 - %tobool226 = icmp ne i32 %103, 0 - br i1 %tobool226, label %if.then227, label %if.end228 - -if.then227: ; preds = %if.end224 - store i32 17939, i32* @f, align 4 - br label %if.end228 - -if.end228: ; preds = %if.then227, %if.end224 - %104 = load i32*, i32** @a, align 4 - %arrayidx229 = getelementptr inbounds i32, i32* %104, i32 59 - %105 = load i32, i32* %arrayidx229, align 4 - %tobool230 = icmp ne i32 %105, 0 - br i1 %tobool230, label %if.then231, label %if.end232 - -if.then231: ; preds = %if.end228 - store i32 25148, i32* @f, align 4 - br label %if.end232 - -if.end232: ; preds = %if.then231, %if.end228 - %106 = load i32*, i32** @a, align 4 - %arrayidx233 = getelementptr inbounds i32, i32* %106, i32 60 - %107 = load i32, i32* %arrayidx233, align 4 - %tobool234 = icmp ne i32 %107, 0 - br i1 %tobool234, label 
%if.then235, label %if.end236 - -if.then235: ; preds = %if.end232 - store i32 644, i32* @f, align 4 - br label %if.end236 - -if.end236: ; preds = %if.then235, %if.end232 - br label %if.then239 - -if.then239: ; preds = %if.end236 - store i32 23457, i32* @f, align 4 - br label %if.end240 - -if.end240: ; preds = %if.then239 - %108 = load i32*, i32** @a, align 4 - %arrayidx241 = getelementptr inbounds i32, i32* %108, i32 62 - %109 = load i32, i32* %arrayidx241, align 4 - %tobool242 = icmp ne i32 %109, 0 - br i1 %tobool242, label %if.then243, label %if.end244 - -if.then243: ; preds = %if.end240 - store i32 21116, i32* @f, align 4 - br label %if.end244 - -if.end244: ; preds = %if.then243, %if.end240 - br label %if.then247 - -if.then247: ; preds = %if.end244 - store i32 10066, i32* @f, align 4 - br label %if.end248 - -if.end248: ; preds = %if.then247 - %110 = load i32*, i32** @a, align 4 - %arrayidx249 = getelementptr inbounds i32, i32* %110, i32 64 - %111 = load i32, i32* %arrayidx249, align 4 - %tobool250 = icmp ne i32 %111, 0 - br i1 %tobool250, label %if.then251, label %if.end252 - -if.then251: ; preds = %if.end248 - store i32 9058, i32* @f, align 4 - br label %if.end252 - -if.end252: ; preds = %if.then251, %if.end248 - %112 = load i32*, i32** @a, align 4 - %arrayidx253 = getelementptr inbounds i32, i32* %112, i32 65 - %113 = load i32, i32* %arrayidx253, align 4 - %tobool254 = icmp ne i32 %113, 0 - br i1 %tobool254, label %if.then255, label %if.end256 - -if.then255: ; preds = %if.end252 - store i32 8383, i32* @f, align 4 - br label %if.end256 - -if.end256: ; preds = %if.then255, %if.end252 - %114 = load i32*, i32** @a, align 4 - %arrayidx257 = getelementptr inbounds i32, i32* %114, i32 66 - %115 = load i32, i32* %arrayidx257, align 4 - %tobool258 = icmp ne i32 %115, 0 - br i1 %tobool258, label %if.then259, label %if.end260 - -if.then259: ; preds = %if.end256 - store i32 31069, i32* @f, align 4 - br label %if.end260 - -if.end260: ; preds = %if.then259, %if.end256 - 
%116 = load i32*, i32** @a, align 4 - %arrayidx261 = getelementptr inbounds i32, i32* %116, i32 67 - %117 = load i32, i32* %arrayidx261, align 4 - %tobool262 = icmp ne i32 %117, 0 - br i1 %tobool262, label %if.then263, label %if.end264 - -if.then263: ; preds = %if.end260 - store i32 32280, i32* @f, align 4 - br label %if.end264 - -if.end264: ; preds = %if.then263, %if.end260 - br label %if.then267 - -if.then267: ; preds = %if.end264 - store i32 1553, i32* @f, align 4 - br label %if.end268 - -if.end268: ; preds = %if.then267 - %118 = load i32*, i32** @a, align 4 - %arrayidx269 = getelementptr inbounds i32, i32* %118, i32 69 - %119 = load i32, i32* %arrayidx269, align 4 - %tobool270 = icmp ne i32 %119, 0 - br i1 %tobool270, label %if.then271, label %if.end272 - -if.then271: ; preds = %if.end268 - store i32 8118, i32* @f, align 4 - br label %if.end272 - -if.end272: ; preds = %if.then271, %if.end268 - %120 = load i32*, i32** @a, align 4 - %arrayidx273 = getelementptr inbounds i32, i32* %120, i32 70 - %121 = load i32, i32* %arrayidx273, align 4 - %tobool274 = icmp ne i32 %121, 0 - br i1 %tobool274, label %if.then275, label %if.end276 - -if.then275: ; preds = %if.end272 - store i32 12959, i32* @f, align 4 - br label %if.end276 - -if.end276: ; preds = %if.then275, %if.end272 - %122 = load i32*, i32** @a, align 4 - %arrayidx277 = getelementptr inbounds i32, i32* %122, i32 71 - %123 = load i32, i32* %arrayidx277, align 4 - %tobool278 = icmp ne i32 %123, 0 - br i1 %tobool278, label %if.then279, label %if.end280 - -if.then279: ; preds = %if.end276 - store i32 675, i32* @f, align 4 - br label %if.end280 - -if.end280: ; preds = %if.then279, %if.end276 - %124 = load i32*, i32** @a, align 4 - %arrayidx281 = getelementptr inbounds i32, i32* %124, i32 72 - %125 = load i32, i32* %arrayidx281, align 4 - %tobool282 = icmp ne i32 %125, 0 - br i1 %tobool282, label %if.then283, label %if.end284 - -if.then283: ; preds = %if.end280 - store i32 29144, i32* @f, align 4 - br label %if.end284 
- -if.end284: ; preds = %if.then283, %if.end280 - %126 = load i32*, i32** @a, align 4 - %arrayidx285 = getelementptr inbounds i32, i32* %126, i32 73 - %127 = load i32, i32* %arrayidx285, align 4 - %tobool286 = icmp ne i32 %127, 0 - br i1 %tobool286, label %if.then287, label %if.end288 - -if.then287: ; preds = %if.end284 - store i32 26130, i32* @f, align 4 - br label %if.end288 - -if.end288: ; preds = %if.then287, %if.end284 - %128 = load i32*, i32** @a, align 4 - %arrayidx289 = getelementptr inbounds i32, i32* %128, i32 74 - %129 = load i32, i32* %arrayidx289, align 4 - %tobool290 = icmp ne i32 %129, 0 - br i1 %tobool290, label %if.then291, label %if.end292 - -if.then291: ; preds = %if.end288 - store i32 31934, i32* @f, align 4 - br label %if.end292 - -if.end292: ; preds = %if.then291, %if.end288 - %130 = load i32*, i32** @a, align 4 - %arrayidx293 = getelementptr inbounds i32, i32* %130, i32 75 - %131 = load i32, i32* %arrayidx293, align 4 - %tobool294 = icmp ne i32 %131, 0 - br i1 %tobool294, label %if.then295, label %if.end296 - -if.then295: ; preds = %if.end292 - store i32 25862, i32* @f, align 4 - br label %if.end296 - -if.end296: ; preds = %if.then295, %if.end292 - %132 = load i32*, i32** @a, align 4 - %arrayidx297 = getelementptr inbounds i32, i32* %132, i32 76 - %133 = load i32, i32* %arrayidx297, align 4 - %tobool298 = icmp ne i32 %133, 0 - br i1 %tobool298, label %if.then299, label %if.end300 - -if.then299: ; preds = %if.end296 - store i32 10642, i32* @f, align 4 - br label %if.end300 - -if.end300: ; preds = %if.then299, %if.end296 - %134 = load i32*, i32** @a, align 4 - %arrayidx301 = getelementptr inbounds i32, i32* %134, i32 77 - %135 = load i32, i32* %arrayidx301, align 4 - %tobool302 = icmp ne i32 %135, 0 - br i1 %tobool302, label %if.then303, label %if.end304 - -if.then303: ; preds = %if.end300 - store i32 20209, i32* @f, align 4 - br label %if.end304 - -if.end304: ; preds = %if.then303, %if.end300 - %136 = load i32*, i32** @a, align 4 - 
%arrayidx305 = getelementptr inbounds i32, i32* %136, i32 78 - %137 = load i32, i32* %arrayidx305, align 4 - %tobool306 = icmp ne i32 %137, 0 - br i1 %tobool306, label %if.then307, label %if.end308 - -if.then307: ; preds = %if.end304 - store i32 30889, i32* @f, align 4 - br label %if.end308 - -if.end308: ; preds = %if.then307, %if.end304 - %138 = load i32*, i32** @a, align 4 - %arrayidx309 = getelementptr inbounds i32, i32* %138, i32 79 - %139 = load i32, i32* %arrayidx309, align 4 - %tobool310 = icmp ne i32 %139, 0 - br i1 %tobool310, label %if.then311, label %if.end312 - -if.then311: ; preds = %if.end308 - store i32 18688, i32* @f, align 4 - br label %if.end312 - -if.end312: ; preds = %if.then311, %if.end308 - %140 = load i32*, i32** @a, align 4 - %arrayidx313 = getelementptr inbounds i32, i32* %140, i32 80 - %141 = load i32, i32* %arrayidx313, align 4 - %tobool314 = icmp ne i32 %141, 0 - br i1 %tobool314, label %if.then315, label %if.end316 - -if.then315: ; preds = %if.end312 - store i32 28726, i32* @f, align 4 - br label %if.end316 - -if.end316: ; preds = %if.then315, %if.end312 - %142 = load i32*, i32** @a, align 4 - %arrayidx317 = getelementptr inbounds i32, i32* %142, i32 81 - %143 = load i32, i32* %arrayidx317, align 4 - %tobool318 = icmp ne i32 %143, 0 - br i1 %tobool318, label %if.then319, label %if.end320 - -if.then319: ; preds = %if.end316 - store i32 4266, i32* @f, align 4 - br label %if.end320 - -if.end320: ; preds = %if.then319, %if.end316 - %144 = load i32*, i32** @a, align 4 - %arrayidx321 = getelementptr inbounds i32, i32* %144, i32 82 - %145 = load i32, i32* %arrayidx321, align 4 - %tobool322 = icmp ne i32 %145, 0 - br i1 %tobool322, label %if.then323, label %if.end324 - -if.then323: ; preds = %if.end320 - store i32 15461, i32* @f, align 4 - br label %if.end324 - -if.end324: ; preds = %if.then323, %if.end320 - %146 = load i32*, i32** @a, align 4 - %arrayidx325 = getelementptr inbounds i32, i32* %146, i32 83 - %147 = load i32, i32* %arrayidx325, 
align 4 - %tobool326 = icmp ne i32 %147, 0 - br i1 %tobool326, label %if.then327, label %if.end328 - -if.then327: ; preds = %if.end324 - store i32 24716, i32* @f, align 4 - br label %if.end328 - -if.end328: ; preds = %if.then327, %if.end324 - br label %if.then331 - -if.then331: ; preds = %if.end328 - store i32 18727, i32* @f, align 4 - br label %if.end332 - -if.end332: ; preds = %if.then331 - %148 = load i32*, i32** @a, align 4 - %arrayidx333 = getelementptr inbounds i32, i32* %148, i32 85 - %149 = load i32, i32* %arrayidx333, align 4 - %tobool334 = icmp ne i32 %149, 0 - br i1 %tobool334, label %if.then335, label %if.end336 - -if.then335: ; preds = %if.end332 - store i32 29505, i32* @f, align 4 - br label %if.end336 - -if.end336: ; preds = %if.then335, %if.end332 - %150 = load i32*, i32** @a, align 4 - %arrayidx337 = getelementptr inbounds i32, i32* %150, i32 86 - %151 = load i32, i32* %arrayidx337, align 4 - %tobool338 = icmp ne i32 %151, 0 - br i1 %tobool338, label %if.then339, label %if.end340 - -if.then339: ; preds = %if.end336 - store i32 27008, i32* @f, align 4 - br label %if.end340 - -if.end340: ; preds = %if.then339, %if.end336 - %152 = load i32*, i32** @a, align 4 - %arrayidx341 = getelementptr inbounds i32, i32* %152, i32 87 - %153 = load i32, i32* %arrayidx341, align 4 - %tobool342 = icmp ne i32 %153, 0 - br i1 %tobool342, label %if.then343, label %if.end344 - -if.then343: ; preds = %if.end340 - store i32 6550, i32* @f, align 4 - br label %if.end344 - -if.end344: ; preds = %if.then343, %if.end340 - br label %if.then347 - -if.then347: ; preds = %if.end344 - store i32 1117, i32* @f, align 4 - br label %if.end348 - -if.end348: ; preds = %if.then347 - %154 = load i32*, i32** @a, align 4 - %arrayidx349 = getelementptr inbounds i32, i32* %154, i32 89 - %155 = load i32, i32* %arrayidx349, align 4 - %tobool350 = icmp ne i32 %155, 0 - br i1 %tobool350, label %if.then351, label %if.end352 - -if.then351: ; preds = %if.end348 - store i32 20118, i32* @f, align 4 - br 
label %if.end352 - -if.end352: ; preds = %if.then351, %if.end348 - %156 = load i32*, i32** @a, align 4 - %arrayidx353 = getelementptr inbounds i32, i32* %156, i32 90 - %157 = load i32, i32* %arrayidx353, align 4 - %tobool354 = icmp ne i32 %157, 0 - br i1 %tobool354, label %if.then355, label %if.end356 - -if.then355: ; preds = %if.end352 - store i32 13650, i32* @f, align 4 - br label %if.end356 - -if.end356: ; preds = %if.then355, %if.end352 - br label %if.then359 - -if.then359: ; preds = %if.end356 - store i32 18642, i32* @f, align 4 - br label %if.end360 - -if.end360: ; preds = %if.then359 - %158 = load i32*, i32** @a, align 4 - %arrayidx361 = getelementptr inbounds i32, i32* %158, i32 92 - %159 = load i32, i32* %arrayidx361, align 4 - %tobool362 = icmp ne i32 %159, 0 - br i1 %tobool362, label %if.then363, label %if.end364 - -if.then363: ; preds = %if.end360 - store i32 30662, i32* @f, align 4 - br label %if.end364 - -if.end364: ; preds = %if.then363, %if.end360 - %160 = load i32*, i32** @a, align 4 - %arrayidx365 = getelementptr inbounds i32, i32* %160, i32 93 - %161 = load i32, i32* %arrayidx365, align 4 - %tobool366 = icmp ne i32 %161, 0 - br i1 %tobool366, label %if.then367, label %if.end368 - -if.then367: ; preds = %if.end364 - store i32 8095, i32* @f, align 4 - br label %if.end368 - -if.end368: ; preds = %if.then367, %if.end364 - %162 = load i32*, i32** @a, align 4 - %arrayidx369 = getelementptr inbounds i32, i32* %162, i32 94 - %163 = load i32, i32* %arrayidx369, align 4 - %tobool370 = icmp ne i32 %163, 0 - br i1 %tobool370, label %if.then371, label %if.end372 - -if.then371: ; preds = %if.end368 - store i32 8442, i32* @f, align 4 - br label %if.end372 - -if.end372: ; preds = %if.then371, %if.end368 - %164 = load i32*, i32** @a, align 4 - %arrayidx373 = getelementptr inbounds i32, i32* %164, i32 95 - %165 = load i32, i32* %arrayidx373, align 4 - %tobool374 = icmp ne i32 %165, 0 - br i1 %tobool374, label %if.then375, label %if.end376 - -if.then375: ; preds = 
%if.end372 - store i32 8153, i32* @f, align 4 - br label %if.end376 - -if.end376: ; preds = %if.then375, %if.end372 - br label %if.then379 - -if.then379: ; preds = %if.end376 - store i32 12965, i32* @f, align 4 - br label %if.end380 - -if.end380: ; preds = %if.then379 - %166 = load i32*, i32** @a, align 4 - %arrayidx381 = getelementptr inbounds i32, i32* %166, i32 97 - %167 = load i32, i32* %arrayidx381, align 4 - %tobool382 = icmp ne i32 %167, 0 - br i1 %tobool382, label %if.then383, label %if.end384 - -if.then383: ; preds = %if.end380 - store i32 14277, i32* @f, align 4 - br label %if.end384 - -if.end384: ; preds = %if.then383, %if.end380 - br label %if.then387 - -if.then387: ; preds = %if.end384 - store i32 1997, i32* @f, align 4 - br label %if.end388 - -if.end388: ; preds = %if.then387 - %168 = load i32*, i32** @a, align 4 - %arrayidx389 = getelementptr inbounds i32, i32* %168, i32 99 - %169 = load i32, i32* %arrayidx389, align 4 - %tobool390 = icmp ne i32 %169, 0 - br i1 %tobool390, label %if.then391, label %if.end392 - -if.then391: ; preds = %if.end388 - store i32 31385, i32* @f, align 4 - br label %if.end392 - -if.end392: ; preds = %if.then391, %if.end388 - %170 = load i32*, i32** @a, align 4 - %arrayidx393 = getelementptr inbounds i32, i32* %170, i32 100 - %171 = load i32, i32* %arrayidx393, align 4 - %tobool394 = icmp ne i32 %171, 0 - br i1 %tobool394, label %if.then395, label %if.end396 - -if.then395: ; preds = %if.end392 - store i32 8286, i32* @f, align 4 - br label %if.end396 - -if.end396: ; preds = %if.then395, %if.end392 - ret void -} diff --git a/llvm/test/CodeGen/Thumb/stack-mis-alignment.ll b/llvm/test/CodeGen/Thumb/stack-mis-alignment.ll new file mode 100644 index 00000000000000..c000fb6a618eed --- /dev/null +++ b/llvm/test/CodeGen/Thumb/stack-mis-alignment.ll @@ -0,0 +1,18 @@ +; RUN: llc -O0 < %s | FileCheck %s + +; For noreturn function with StackAlignment 8 (function contains call/alloc), +; check that lr is saved to keep the stack aligned. 
+; CHECK: push {lr} + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv5e-none-linux-gnueabi" + +define dso_local i32 @f() noreturn nounwind { +entry: + call i32 @llvm.arm.space(i32 2048, i32 undef) + tail call i32 @exit(i32 0) + unreachable +} + +declare i32 @llvm.arm.space(i32, i32) +declare dso_local i32 @exit(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll index ebd93db9bdbe95..905b6d14bf080c 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -1778,11 +1778,11 @@ for.body: ; preds = %for.body, %for.body define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: half_short_mac: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: cbz r2, .LBB11_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: subs r3, r2, #1 -; CHECK-NEXT: and r7, r2, #3 +; CHECK-NEXT: and r6, r2, #3 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhs .LBB11_4 ; CHECK-NEXT: @ %bb.2: @@ -1799,33 +1799,33 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n ; CHECK-NEXT: vldr s0, .LCPI11_0 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2 -; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: adds r3, r1, #4 +; CHECK-NEXT: adds r2, r0, #4 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB11_5: @ %for.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: adds r6, r0, r3 -; CHECK-NEXT: vldr.16 s2, [r6, #6] +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrsh.w r4, [r3, #2] +; CHECK-NEXT: vldr.16 s2, [r2, #2] ; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: ldrsh.w r4, [r2, #2] -; CHECK-NEXT: ldrsh.w r5, [r2, #4] -; CHECK-NEXT: ldrsh.w r2, 
[r2, #6] -; CHECK-NEXT: vmov s8, r4 -; CHECK-NEXT: vmov s6, r5 -; CHECK-NEXT: vmov s4, r2 +; CHECK-NEXT: vmov s4, r4 ; CHECK-NEXT: vcvt.f16.s32 s4, s4 +; CHECK-NEXT: ldrsh.w r4, [r3] ; CHECK-NEXT: vmul.f16 s2, s2, s4 -; CHECK-NEXT: vldr.16 s4, [r6, #4] +; CHECK-NEXT: vldr.16 s4, [r2] +; CHECK-NEXT: vmov s6, r4 ; CHECK-NEXT: vcvt.f16.s32 s6, s6 +; CHECK-NEXT: ldrsh r5, [r3, #-2] +; CHECK-NEXT: ldrsh r4, [r3, #-4] ; CHECK-NEXT: vmul.f16 s4, s4, s6 -; CHECK-NEXT: vldr.16 s6, [r6, #2] +; CHECK-NEXT: vldr.16 s6, [r2, #-2] +; CHECK-NEXT: adds r3, #8 +; CHECK-NEXT: vmov s8, r5 ; CHECK-NEXT: vcvt.f16.s32 s8, s8 -; CHECK-NEXT: ldrsh r2, [r1, r3] +; CHECK-NEXT: vmov s10, r4 ; CHECK-NEXT: vmul.f16 s6, s6, s8 -; CHECK-NEXT: vldr.16 s8, [r6] -; CHECK-NEXT: adds r3, #8 -; CHECK-NEXT: vmov s10, r2 +; CHECK-NEXT: vldr.16 s8, [r2, #-4] ; CHECK-NEXT: vcvt.f16.s32 s10, s10 +; CHECK-NEXT: adds r2, #8 ; CHECK-NEXT: vmul.f16 s8, s8, s10 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6 @@ -1837,11 +1837,11 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: le lr, .LBB11_5 ; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, r7, .LBB11_9 +; CHECK-NEXT: wls lr, r6, .LBB11_9 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader ; CHECK-NEXT: add.w r0, r0, r12, lsl #1 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1 -; CHECK-NEXT: mov lr, r7 +; CHECK-NEXT: mov lr, r6 ; CHECK-NEXT: .LBB11_8: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh r2, [r1], #2 @@ -1854,7 +1854,7 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: le lr, .LBB11_8 ; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.10: ; CHECK-NEXT: .LCPI11_0: diff --git 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll index 5fd03a78132264..0b8a20e8256949 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -372,29 +372,29 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB5_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r4, r3, r12, lsl #2 -; CHECK-NEXT: add.w r5, r1, r12 -; CHECK-NEXT: cmp r4, r1 -; CHECK-NEXT: add.w r6, r0, r12 -; CHECK-NEXT: cset r7, hi -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r0 +; CHECK-NEXT: add.w r6, r3, r12, lsl #2 +; CHECK-NEXT: add.w r4, r1, r12 +; CHECK-NEXT: cmp r6, r1 +; CHECK-NEXT: add.w r5, r0, r12 +; CHECK-NEXT: cset lr, hi +; CHECK-NEXT: cmp r4, r3 ; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: cmp r6, r3 +; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: ands r6, r4 -; CHECK-NEXT: lsls r6, r6, #31 -; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq.w r4, r5, r7 -; CHECK-NEXT: lslseq.w r4, r4, #31 -; CHECK-NEXT: beq .LBB5_4 +; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cset r5, hi +; CHECK-NEXT: ands r5, r6 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: lsls r5, r5, #31 +; CHECK-NEXT: itt eq +; CHECK-NEXT: andeq.w r5, r4, lr +; CHECK-NEXT: lslseq.w r5, r5, #31 +; CHECK-NEXT: beq .LBB5_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: and r9, r12, #3 -; CHECK-NEXT: cmp r4, #3 -; CHECK-NEXT: bhs .LBB5_6 +; CHECK-NEXT: sub.w r5, r12, #1 +; CHECK-NEXT: and r9, r12, #3 +; CHECK-NEXT: cmp r5, #3 +; CHECK-NEXT: bhs .LBB5_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB5_8 @@ -409,35 +409,37 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: letp lr, 
.LBB5_5 ; CHECK-NEXT: b .LBB5_11 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new -; CHECK-NEXT: bic r7, r12, #3 +; CHECK-NEXT: bic r5, r12, #3 ; CHECK-NEXT: add.w r4, r3, #8 -; CHECK-NEXT: subs r7, #4 +; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: add.w lr, lr, r7, lsr #2 +; CHECK-NEXT: add.w lr, r6, r5, lsr #2 +; CHECK-NEXT: adds r5, r0, #3 +; CHECK-NEXT: adds r6, r1, #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_7: @ %for.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrb.w r5, [r0, r12] -; CHECK-NEXT: add.w r7, r1, r12 -; CHECK-NEXT: ldrb.w r6, [r1, r12] -; CHECK-NEXT: smlabb r5, r6, r5, r2 -; CHECK-NEXT: str r5, [r4, #-8] -; CHECK-NEXT: add.w r5, r0, r12 -; CHECK-NEXT: ldrb r6, [r7, #1] -; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: ldrb.w r8, [r5, #1] -; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r4, #-4] -; CHECK-NEXT: ldrb.w r8, [r5, #2] -; CHECK-NEXT: ldrb r6, [r7, #2] -; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r4] -; CHECK-NEXT: ldrb r5, [r5, #3] -; CHECK-NEXT: ldrb r6, [r7, #3] -; CHECK-NEXT: smlabb r5, r6, r5, r2 -; CHECK-NEXT: str r5, [r4, #4] -; CHECK-NEXT: adds r4, #16 -; CHECK-NEXT: le lr, .LBB5_7 +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrb r8, [r5, #-3] +; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: ldrb r7, [r6, #-1] +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4, #-8] +; CHECK-NEXT: ldrb r8, [r5, #-2] +; CHECK-NEXT: ldrb r7, [r6] +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4, #-4] +; CHECK-NEXT: ldrb r8, [r5, #-1] +; CHECK-NEXT: ldrb r7, [r6, #1] +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4] +; CHECK-NEXT: ldrb.w r8, [r5] +; CHECK-NEXT: adds r5, #4 +; CHECK-NEXT: ldrb r7, [r6, #2] +; CHECK-NEXT: adds r6, #4 +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4, #4] +; CHECK-NEXT: adds r4, #16 +; CHECK-NEXT: le lr, .LBB5_7 ; CHECK-NEXT: .LBB5_8: @ 
%for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: wls lr, r9, .LBB5_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader @@ -447,10 +449,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: mov lr, r9 ; CHECK-NEXT: .LBB5_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrb r7, [r0], #1 -; CHECK-NEXT: ldrb r6, [r1], #1 -; CHECK-NEXT: smlabb r7, r6, r7, r2 -; CHECK-NEXT: str r7, [r3], #4 +; CHECK-NEXT: ldrb r6, [r0], #1 +; CHECK-NEXT: ldrb r5, [r1], #1 +; CHECK-NEXT: smlabb r6, r5, r6, r2 +; CHECK-NEXT: str r6, [r3], #4 ; CHECK-NEXT: le lr, .LBB5_10 ; CHECK-NEXT: .LBB5_11: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} @@ -663,28 +665,28 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB7_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r4, r3, r12, lsl #2 -; CHECK-NEXT: add.w r5, r1, r12 -; CHECK-NEXT: cmp r4, r1 -; CHECK-NEXT: add.w r6, r0, r12 -; CHECK-NEXT: cset r7, hi -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r0 +; CHECK-NEXT: add.w r6, r3, r12, lsl #2 +; CHECK-NEXT: add.w r4, r1, r12 +; CHECK-NEXT: cmp r6, r1 +; CHECK-NEXT: add.w r5, r0, r12 +; CHECK-NEXT: cset lr, hi +; CHECK-NEXT: cmp r4, r3 ; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: cmp r6, r3 +; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: ands r6, r4 -; CHECK-NEXT: lsls r6, r6, #31 +; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cset r5, hi +; CHECK-NEXT: ands r5, r6 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: lsls r5, r5, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq.w r4, r5, r7 -; CHECK-NEXT: lslseq.w r4, r4, #31 +; CHECK-NEXT: andeq.w r5, r4, lr +; CHECK-NEXT: lslseq.w r5, r5, #31 ; CHECK-NEXT: beq .LBB7_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: sub.w r4, r12, #1 +; CHECK-NEXT: sub.w r5, r12, #1 ; 
CHECK-NEXT: and r9, r12, #3 -; CHECK-NEXT: cmp r4, #3 +; CHECK-NEXT: cmp r5, #3 ; CHECK-NEXT: bhs .LBB7_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 @@ -700,33 +702,35 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: letp lr, .LBB7_5 ; CHECK-NEXT: b .LBB7_11 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new -; CHECK-NEXT: bic r7, r12, #3 +; CHECK-NEXT: bic r5, r12, #3 ; CHECK-NEXT: add.w r4, r3, #8 -; CHECK-NEXT: subs r7, #4 +; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: add.w lr, lr, r7, lsr #2 +; CHECK-NEXT: add.w lr, r6, r5, lsr #2 +; CHECK-NEXT: adds r5, r0, #3 +; CHECK-NEXT: adds r6, r1, #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrb.w r5, [r0, r12] -; CHECK-NEXT: add.w r7, r1, r12 -; CHECK-NEXT: ldrb.w r6, [r1, r12] -; CHECK-NEXT: smlabb r5, r6, r5, r2 -; CHECK-NEXT: str r5, [r4, #-8] -; CHECK-NEXT: add.w r5, r0, r12 -; CHECK-NEXT: ldrb r6, [r7, #1] +; CHECK-NEXT: ldrb r8, [r5, #-3] ; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: ldrb.w r8, [r5, #1] -; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r4, #-4] -; CHECK-NEXT: ldrb.w r8, [r5, #2] -; CHECK-NEXT: ldrb r6, [r7, #2] -; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r4] -; CHECK-NEXT: ldrb r5, [r5, #3] -; CHECK-NEXT: ldrb r6, [r7, #3] -; CHECK-NEXT: smlabb r5, r6, r5, r2 -; CHECK-NEXT: str r5, [r4, #4] +; CHECK-NEXT: ldrb r7, [r6, #-1] +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4, #-8] +; CHECK-NEXT: ldrb r8, [r5, #-2] +; CHECK-NEXT: ldrb r7, [r6] +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4, #-4] +; CHECK-NEXT: ldrb r8, [r5, #-1] +; CHECK-NEXT: ldrb r7, [r6, #1] +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4] +; CHECK-NEXT: ldrb.w r8, [r5] +; CHECK-NEXT: adds r5, #4 +; CHECK-NEXT: ldrb r7, [r6, #2] +; CHECK-NEXT: adds r6, #4 +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; 
CHECK-NEXT: str r7, [r4, #4] ; CHECK-NEXT: adds r4, #16 ; CHECK-NEXT: le lr, .LBB7_7 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup.loopexit.unr-lcssa @@ -738,10 +742,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: mov lr, r9 ; CHECK-NEXT: .LBB7_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrb r7, [r0], #1 -; CHECK-NEXT: ldrb r6, [r1], #1 -; CHECK-NEXT: smlabb r7, r6, r7, r2 -; CHECK-NEXT: str r7, [r3], #4 +; CHECK-NEXT: ldrb r6, [r0], #1 +; CHECK-NEXT: ldrb r5, [r1], #1 +; CHECK-NEXT: smlabb r6, r5, r6, r2 +; CHECK-NEXT: str r6, [r3], #4 ; CHECK-NEXT: le lr, .LBB7_10 ; CHECK-NEXT: .LBB7_11: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll new file mode 100644 index 00000000000000..d74f3bbfb2e0e3 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -0,0 +1,698 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s + +; Check some LSR loop postinc + +; fma loop with a destination that is the same as one of the sources +define void @fma(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: fma: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: blt .LBB0_8 +; CHECK-NEXT: @ %bb.1: @ %for.body.preheader +; CHECK-NEXT: cmp r3, #3 +; CHECK-NEXT: bhi .LBB0_3 +; CHECK-NEXT: @ %bb.2: +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: b .LBB0_6 +; CHECK-NEXT: .LBB0_3: @ %vector.ph +; CHECK-NEXT: bic r12, r3, #3 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: sub.w r6, r12, #4 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: add.w lr, r5, r6, lsr #2 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: 
mov r6, r2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB0_4: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r4], #16 +; CHECK-NEXT: vldrw.u32 q1, [r5], #16 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: vfma.f32 q2, q1, q0 +; CHECK-NEXT: vstrb.8 q2, [r6], #16 +; CHECK-NEXT: le lr, .LBB0_4 +; CHECK-NEXT: @ %bb.5: @ %middle.block +; CHECK-NEXT: cmp r12, r3 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r4, r5, r6, pc} +; CHECK-NEXT: .LBB0_6: @ %for.body.preheader12 +; CHECK-NEXT: sub.w lr, r3, r12 +; CHECK-NEXT: add.w r0, r0, r12, lsl #2 +; CHECK-NEXT: add.w r1, r1, r12, lsl #2 +; CHECK-NEXT: add.w r2, r2, r12, lsl #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB0_7: @ %for.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldr s2, [r1] +; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: vldr s4, [r2] +; CHECK-NEXT: vfma.f32 s4, s2, s0 +; CHECK-NEXT: vstr s4, [r2] +; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: le lr, .LBB0_7 +; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %cmp8 = icmp sgt i32 %n, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %min.iters.check = icmp ult i32 %n, 4 + br i1 %min.iters.check, label %for.body.preheader12, label %vector.ph + +for.body.preheader12: ; preds = %middle.block, %for.body.preheader + %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + br label %for.body + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i32 %n, -4 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, float* %A, i32 %index + %1 = bitcast float* %0 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %1, align 4 + %2 = getelementptr inbounds float, float* %B, i32 %index + %3 = bitcast float* 
%2 to <4 x float>* + %wide.load10 = load <4 x float>, <4 x float>* %3, align 4 + %4 = fmul fast <4 x float> %wide.load10, %wide.load + %5 = getelementptr inbounds float, float* %C, i32 %index + %6 = bitcast float* %5 to <4 x float>* + %wide.load11 = load <4 x float>, <4 x float>* %6, align 4 + %7 = fadd fast <4 x float> %wide.load11, %4 + %8 = bitcast float* %5 to <4 x float>* + store <4 x float> %7, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n.vec + br i1 %9, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i32 %n.vec, %n + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader12 + +for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + ret void + +for.body: ; preds = %for.body.preheader12, %for.body + %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader12 ] + %arrayidx = getelementptr inbounds float, float* %A, i32 %i.09 + %10 = load float, float* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.09 + %11 = load float, float* %arrayidx1, align 4 + %mul = fmul fast float %11, %10 + %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.09 + %12 = load float, float* %arrayidx2, align 4 + %add = fadd fast float %12, %mul + store float %add, float* %arrayidx2, align 4 + %inc = add nuw nsw i32 %i.09, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + + +; Same as above but tail predicated +; FIXME: The postinc here is put on the load, not the store. An extra mov is needed in the loop because of it. 
+define void @fma_tailpred(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: fma_tailpred: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: blt .LBB1_3 +; CHECK-NEXT: @ %bb.1: @ %vector.ph +; CHECK-NEXT: add.w r12, r3, #3 +; CHECK-NEXT: adr r4, .LCPI1_0 +; CHECK-NEXT: bic r12, r12, #3 +; CHECK-NEXT: mov.w lr, #1 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: subs r3, #1 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vdup.32 q1, r3 +; CHECK-NEXT: add.w lr, lr, r12, lsr #2 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB1_2: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vdup.32 q2, r12 +; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: vorr q2, q2, q0 +; CHECK-NEXT: vpttt.u32 cs, q1, q2 +; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 +; CHECK-NEXT: vldrwt.u32 q3, [r1], #16 +; CHECK-NEXT: vldrwt.u32 q4, [r3], #16 +; CHECK-NEXT: vfma.f32 q4, q3, q2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q4, [r2] +; CHECK-NEXT: mov r2, r3 +; CHECK-NEXT: le lr, .LBB1_2 +; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.4: +; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 3 @ 0x3 +entry: + %cmp8 = icmp sgt i32 %n, 0 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %n, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 %n, -1 + %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer + br 
label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = or <4 x i32> %broadcast.splat, + %0 = getelementptr inbounds float, float* %A, i32 %index + %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 + %2 = bitcast float* %0 to <4 x float>* + %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) + %3 = getelementptr inbounds float, float* %B, i32 %index + %4 = bitcast float* %3 to <4 x float>* + %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef) + %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load + %6 = getelementptr inbounds float, float* %C, i32 %index + %7 = bitcast float* %6 to <4 x float>* + %wide.masked.load13 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %1, <4 x float> undef) + %8 = fadd fast <4 x float> %wide.masked.load13, %5 + %9 = bitcast float* %6 to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %8, <4 x float>* %9, i32 4, <4 x i1> %1) + %index.next = add i32 %index, 4 + %10 = icmp eq i32 %index.next, %n.vec + br i1 %10, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +; Multiple loads of the loop with a common base +define i8* @test(i8* nocapture readonly %input_row, i8* nocapture readonly %input_col, i16 zeroext %output_ch, i16 zeroext %num_cols, i32 %col_offset, i16 signext %activation_min, i16 zeroext %row_len, i32* nocapture readonly %bias, i8* returned %out) { +; CHECK-LABEL: test: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; 
CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #20 +; CHECK-NEXT: sub sp, #20 +; CHECK-NEXT: cmp r3, #4 +; CHECK-NEXT: strd r0, r1, [sp, #12] @ 8-byte Folded Spill +; CHECK-NEXT: bne .LBB2_8 +; CHECK-NEXT: @ %bb.1: @ %for.cond.preheader +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: beq .LBB2_8 +; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph +; CHECK-NEXT: ldr r3, [sp, #64] +; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr.w r9, [sp, #56] +; CHECK-NEXT: add.w r0, r1, r3, lsl #1 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: adds r0, r1, r3 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r3, r3, lsl #1 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: adds r0, r3, #7 +; CHECK-NEXT: lsrs r0, r0, #3 +; CHECK-NEXT: b .LBB2_5 +; CHECK-NEXT: .LBB2_3: @ in Loop: Header=BB2_5 Depth=1 +; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: .LBB2_4: @ %for.cond.cleanup23 +; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 +; CHECK-NEXT: ldr r3, [sp, #72] +; CHECK-NEXT: add.w r1, r10, r8 +; CHECK-NEXT: add r1, r6 +; CHECK-NEXT: add r1, r12 +; CHECK-NEXT: strb.w r1, [r3, r11] +; CHECK-NEXT: add.w r11, r11, #1 +; CHECK-NEXT: cmp r11, r2 +; CHECK-NEXT: beq .LBB2_8 +; CHECK-NEXT: .LBB2_5: @ %for.body +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB2_7 Depth 2 +; CHECK-NEXT: ldr r1, [sp, #68] +; CHECK-NEXT: subs.w lr, r0, r0 +; CHECK-NEXT: ldr.w r12, [r1, r11, lsl #2] +; CHECK-NEXT: ble .LBB2_3 +; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader +; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 +; CHECK-NEXT: ldr r3, [sp, #64] +; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mla r7, r11, r3, r1 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; 
CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: .LBB2_7: @ %for.body24 +; CHECK-NEXT: @ Parent Loop BB2_5 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldrb.s16 q0, [r4], #8 +; CHECK-NEXT: vadd.i16 q1, q0, r9 +; CHECK-NEXT: vldrb.s16 q0, [r7], #8 +; CHECK-NEXT: vmlava.s16 r12, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r5], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r9 +; CHECK-NEXT: vmlava.s16 r6, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r3], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r9 +; CHECK-NEXT: vmlava.s16 r10, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r1], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r9 +; CHECK-NEXT: vmlava.s16 r8, q0, q1 +; CHECK-NEXT: le lr, .LBB2_7 +; CHECK-NEXT: b .LBB2_4 +; CHECK-NEXT: .LBB2_8: @ %if.end +; CHECK-NEXT: ldr r0, [sp, #72] +; CHECK-NEXT: add sp, #20 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +entry: + %cmp = icmp eq i16 %num_cols, 4 + br i1 %cmp, label %for.cond.preheader, label %if.end + +for.cond.preheader: ; preds = %entry + %conv2 = zext i16 %output_ch to i32 + %cmp3114 = icmp eq i16 %output_ch, 0 + br i1 %cmp3114, label %if.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %for.cond.preheader + %conv5 = zext i16 %row_len to i32 + %add.ptr9 = getelementptr inbounds i8, i8* %input_col, i32 %conv5 + %mul11 = shl nuw nsw i32 %conv5, 1 + %add.ptr12 = getelementptr inbounds i8, i8* %input_col, i32 %mul11 + %mul14 = mul nuw nsw i32 %conv5, 3 + %add.ptr15 = getelementptr inbounds i8, i8* %input_col, i32 %mul14 + %add = add nuw nsw i32 %conv5, 7 + %div = lshr i32 %add, 3 + %conv25 = trunc i32 %col_offset to i16 + %.splatinsert = insertelement <8 x i16> undef, i16 %conv25, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + br label %for.body + +for.body: ; preds = %for.cond.cleanup23, %for.body.lr.ph + %i_out_ch.0116 = phi i32 [ 0, %for.body.lr.ph ], [ %inc37, %for.cond.cleanup23 ] + %i_row_loop.0115 = phi i32 [ 
undef, %for.body.lr.ph ], [ %i_row_loop.1.lcssa, %for.cond.cleanup23 ] + %arrayidx = getelementptr inbounds i32, i32* %bias, i32 %i_out_ch.0116 + %0 = load i32, i32* %arrayidx, align 4 + %cmp2199 = icmp slt i32 %i_row_loop.0115, %div + br i1 %cmp2199, label %for.body24.preheader, label %for.cond.cleanup23 + +for.body24.preheader: ; preds = %for.body + %mul = mul nuw nsw i32 %i_out_ch.0116, %conv5 + %add.ptr = getelementptr inbounds i8, i8* %input_row, i32 %mul + br label %for.body24 + +for.cond.cleanup23: ; preds = %for.body24, %for.body + %acc_0.0.lcssa = phi i32 [ %0, %for.body ], [ %20, %for.body24 ] + %acc_1.0.lcssa = phi i32 [ %0, %for.body ], [ %21, %for.body24 ] + %acc_2.0.lcssa = phi i32 [ %0, %for.body ], [ %22, %for.body24 ] + %acc_3.0.lcssa = phi i32 [ %0, %for.body ], [ %23, %for.body24 ] + %i_row_loop.1.lcssa = phi i32 [ %i_row_loop.0115, %for.body ], [ %div, %for.body24 ] + %add31 = add nsw i32 %acc_1.0.lcssa, %acc_0.0.lcssa + %add32 = add nsw i32 %add31, %acc_2.0.lcssa + %add33 = add nsw i32 %add32, %acc_3.0.lcssa + %conv34 = trunc i32 %add33 to i8 + %arrayidx35 = getelementptr inbounds i8, i8* %out, i32 %i_out_ch.0116 + store i8 %conv34, i8* %arrayidx35, align 1 + %inc37 = add nuw nsw i32 %i_out_ch.0116, 1 + %exitcond120 = icmp eq i32 %inc37, %conv2 + br i1 %exitcond120, label %if.end, label %for.body + +for.body24: ; preds = %for.body24, %for.body24.preheader + %ip_r0.0109 = phi i8* [ %add.ptr26, %for.body24 ], [ %add.ptr, %for.body24.preheader ] + %ip_c0.0108 = phi i8* [ %add.ptr27, %for.body24 ], [ %input_col, %for.body24.preheader ] + %ip_c1.0107 = phi i8* [ %add.ptr28, %for.body24 ], [ %add.ptr9, %for.body24.preheader ] + %ip_c2.0106 = phi i8* [ %add.ptr29, %for.body24 ], [ %add.ptr12, %for.body24.preheader ] + %i_row_loop.1105 = phi i32 [ %inc, %for.body24 ], [ %i_row_loop.0115, %for.body24.preheader ] + %ip_c3.0104 = phi i8* [ %add.ptr30, %for.body24 ], [ %add.ptr15, %for.body24.preheader ] + %acc_3.0103 = phi i32 [ %23, %for.body24 ], [ %0, 
%for.body24.preheader ] + %acc_2.0102 = phi i32 [ %22, %for.body24 ], [ %0, %for.body24.preheader ] + %acc_1.0101 = phi i32 [ %21, %for.body24 ], [ %0, %for.body24.preheader ] + %acc_0.0100 = phi i32 [ %20, %for.body24 ], [ %0, %for.body24.preheader ] + %1 = bitcast i8* %ip_r0.0109 to <8 x i8>* + %2 = load <8 x i8>, <8 x i8>* %1, align 1 + %3 = sext <8 x i8> %2 to <8 x i16> + %add.ptr26 = getelementptr inbounds i8, i8* %ip_r0.0109, i32 8 + %4 = bitcast i8* %ip_c0.0108 to <8 x i8>* + %5 = load <8 x i8>, <8 x i8>* %4, align 1 + %6 = sext <8 x i8> %5 to <8 x i16> + %add.ptr27 = getelementptr inbounds i8, i8* %ip_c0.0108, i32 8 + %7 = add <8 x i16> %.splat, %6 + %8 = bitcast i8* %ip_c1.0107 to <8 x i8>* + %9 = load <8 x i8>, <8 x i8>* %8, align 1 + %10 = sext <8 x i8> %9 to <8 x i16> + %add.ptr28 = getelementptr inbounds i8, i8* %ip_c1.0107, i32 8 + %11 = add <8 x i16> %.splat, %10 + %12 = bitcast i8* %ip_c2.0106 to <8 x i8>* + %13 = load <8 x i8>, <8 x i8>* %12, align 1 + %14 = sext <8 x i8> %13 to <8 x i16> + %add.ptr29 = getelementptr inbounds i8, i8* %ip_c2.0106, i32 8 + %15 = add <8 x i16> %.splat, %14 + %16 = bitcast i8* %ip_c3.0104 to <8 x i8>* + %17 = load <8 x i8>, <8 x i8>* %16, align 1 + %18 = sext <8 x i8> %17 to <8 x i16> + %add.ptr30 = getelementptr inbounds i8, i8* %ip_c3.0104, i32 8 + %19 = add <8 x i16> %.splat, %18 + %20 = tail call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 %acc_0.0100, <8 x i16> %3, <8 x i16> %7) + %21 = tail call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 %acc_1.0101, <8 x i16> %3, <8 x i16> %11) + %22 = tail call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 %acc_2.0102, <8 x i16> %3, <8 x i16> %15) + %23 = tail call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 %acc_3.0103, <8 x i16> %3, <8 x i16> %19) + %inc = add nsw i32 %i_row_loop.1105, 1 + %exitcond = icmp eq i32 %inc, %div + br i1 %exitcond, label %for.cond.cleanup23, label %for.body24 + +if.end: ; preds = 
%for.cond.cleanup23, %for.cond.preheader, %entry + ret i8* %out +} + +; Same as above with optsize +define i8* @test_optsize(i8* nocapture readonly %input_row, i8* nocapture readonly %input_col, i16 zeroext %output_ch, i16 zeroext %num_cols, i32 %col_offset, i16 signext %activation_min, i16 zeroext %row_len, i32* nocapture readonly %bias, i8* returned %out) optsize { +; CHECK-LABEL: test_optsize: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #20 +; CHECK-NEXT: sub sp, #20 +; CHECK-NEXT: cmp r3, #4 +; CHECK-NEXT: strd r0, r1, [sp, #12] @ 8-byte Folded Spill +; CHECK-NEXT: bne .LBB3_8 +; CHECK-NEXT: @ %bb.1: @ %for.cond.preheader +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: beq .LBB3_8 +; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph +; CHECK-NEXT: ldr r3, [sp, #64] +; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr.w r9, [sp, #56] +; CHECK-NEXT: add.w r0, r1, r3, lsl #1 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: adds r0, r1, r3 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r3, r3, lsl #1 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: adds r0, r3, #7 +; CHECK-NEXT: lsrs r0, r0, #3 +; CHECK-NEXT: .LBB3_3: @ %for.body +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB3_5 Depth 2 +; CHECK-NEXT: ldr r1, [sp, #68] +; CHECK-NEXT: subs.w lr, r0, r0 +; CHECK-NEXT: ldr.w r12, [r1, r11, lsl #2] +; CHECK-NEXT: ble .LBB3_6 +; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader +; CHECK-NEXT: @ in Loop: Header=BB3_3 Depth=1 +; CHECK-NEXT: ldr r3, [sp, #64] +; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mla r7, r11, r3, r1 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; 
CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: .LBB3_5: @ %for.body24 +; CHECK-NEXT: @ Parent Loop BB3_3 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldrb.s16 q0, [r4], #8 +; CHECK-NEXT: vadd.i16 q1, q0, r9 +; CHECK-NEXT: vldrb.s16 q0, [r7], #8 +; CHECK-NEXT: vmlava.s16 r12, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r5], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r9 +; CHECK-NEXT: vmlava.s16 r6, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r3], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r9 +; CHECK-NEXT: vmlava.s16 r10, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r1], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r9 +; CHECK-NEXT: vmlava.s16 r8, q0, q1 +; CHECK-NEXT: le lr, .LBB3_5 +; CHECK-NEXT: b .LBB3_7 +; CHECK-NEXT: .LBB3_6: @ in Loop: Header=BB3_3 Depth=1 +; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: .LBB3_7: @ %for.cond.cleanup23 +; CHECK-NEXT: @ in Loop: Header=BB3_3 Depth=1 +; CHECK-NEXT: ldr r3, [sp, #72] +; CHECK-NEXT: add.w r1, r10, r8 +; CHECK-NEXT: add r1, r6 +; CHECK-NEXT: add r1, r12 +; CHECK-NEXT: strb.w r1, [r3, r11] +; CHECK-NEXT: add.w r11, r11, #1 +; CHECK-NEXT: cmp r11, r2 +; CHECK-NEXT: bne .LBB3_3 +; CHECK-NEXT: .LBB3_8: @ %if.end +; CHECK-NEXT: ldr r0, [sp, #72] +; CHECK-NEXT: add sp, #20 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +entry: + %cmp = icmp eq i16 %num_cols, 4 + br i1 %cmp, label %for.cond.preheader, label %if.end + +for.cond.preheader: ; preds = %entry + %conv2 = zext i16 %output_ch to i32 + %cmp3114 = icmp eq i16 %output_ch, 0 + br i1 %cmp3114, label %if.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %for.cond.preheader + %conv5 = zext i16 %row_len to i32 + %add.ptr9 = getelementptr inbounds i8, i8* %input_col, i32 %conv5 + %mul11 = shl nuw nsw i32 %conv5, 1 + %add.ptr12 = getelementptr inbounds i8, i8* %input_col, i32 %mul11 + %mul14 = mul nuw nsw i32 %conv5, 3 + %add.ptr15 = getelementptr inbounds i8, i8* %input_col, 
i32 %mul14 + %add = add nuw nsw i32 %conv5, 7 + %div = lshr i32 %add, 3 + %conv25 = trunc i32 %col_offset to i16 + %.splatinsert = insertelement <8 x i16> undef, i16 %conv25, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + br label %for.body + +for.body: ; preds = %for.cond.cleanup23, %for.body.lr.ph + %i_out_ch.0116 = phi i32 [ 0, %for.body.lr.ph ], [ %inc37, %for.cond.cleanup23 ] + %i_row_loop.0115 = phi i32 [ undef, %for.body.lr.ph ], [ %i_row_loop.1.lcssa, %for.cond.cleanup23 ] + %arrayidx = getelementptr inbounds i32, i32* %bias, i32 %i_out_ch.0116 + %0 = load i32, i32* %arrayidx, align 4 + %cmp2199 = icmp slt i32 %i_row_loop.0115, %div + br i1 %cmp2199, label %for.body24.preheader, label %for.cond.cleanup23 + +for.body24.preheader: ; preds = %for.body + %mul = mul nuw nsw i32 %i_out_ch.0116, %conv5 + %add.ptr = getelementptr inbounds i8, i8* %input_row, i32 %mul + br label %for.body24 + +for.cond.cleanup23: ; preds = %for.body24, %for.body + %acc_0.0.lcssa = phi i32 [ %0, %for.body ], [ %20, %for.body24 ] + %acc_1.0.lcssa = phi i32 [ %0, %for.body ], [ %21, %for.body24 ] + %acc_2.0.lcssa = phi i32 [ %0, %for.body ], [ %22, %for.body24 ] + %acc_3.0.lcssa = phi i32 [ %0, %for.body ], [ %23, %for.body24 ] + %i_row_loop.1.lcssa = phi i32 [ %i_row_loop.0115, %for.body ], [ %div, %for.body24 ] + %add31 = add nsw i32 %acc_1.0.lcssa, %acc_0.0.lcssa + %add32 = add nsw i32 %add31, %acc_2.0.lcssa + %add33 = add nsw i32 %add32, %acc_3.0.lcssa + %conv34 = trunc i32 %add33 to i8 + %arrayidx35 = getelementptr inbounds i8, i8* %out, i32 %i_out_ch.0116 + store i8 %conv34, i8* %arrayidx35, align 1 + %inc37 = add nuw nsw i32 %i_out_ch.0116, 1 + %exitcond120 = icmp eq i32 %inc37, %conv2 + br i1 %exitcond120, label %if.end, label %for.body + +for.body24: ; preds = %for.body24, %for.body24.preheader + %ip_r0.0109 = phi i8* [ %add.ptr26, %for.body24 ], [ %add.ptr, %for.body24.preheader ] + %ip_c0.0108 = phi i8* [ %add.ptr27, 
%for.body24 ], [ %input_col, %for.body24.preheader ] + %ip_c1.0107 = phi i8* [ %add.ptr28, %for.body24 ], [ %add.ptr9, %for.body24.preheader ] + %ip_c2.0106 = phi i8* [ %add.ptr29, %for.body24 ], [ %add.ptr12, %for.body24.preheader ] + %i_row_loop.1105 = phi i32 [ %inc, %for.body24 ], [ %i_row_loop.0115, %for.body24.preheader ] + %ip_c3.0104 = phi i8* [ %add.ptr30, %for.body24 ], [ %add.ptr15, %for.body24.preheader ] + %acc_3.0103 = phi i32 [ %23, %for.body24 ], [ %0, %for.body24.preheader ] + %acc_2.0102 = phi i32 [ %22, %for.body24 ], [ %0, %for.body24.preheader ] + %acc_1.0101 = phi i32 [ %21, %for.body24 ], [ %0, %for.body24.preheader ] + %acc_0.0100 = phi i32 [ %20, %for.body24 ], [ %0, %for.body24.preheader ] + %1 = bitcast i8* %ip_r0.0109 to <8 x i8>* + %2 = load <8 x i8>, <8 x i8>* %1, align 1 + %3 = sext <8 x i8> %2 to <8 x i16> + %add.ptr26 = getelementptr inbounds i8, i8* %ip_r0.0109, i32 8 + %4 = bitcast i8* %ip_c0.0108 to <8 x i8>* + %5 = load <8 x i8>, <8 x i8>* %4, align 1 + %6 = sext <8 x i8> %5 to <8 x i16> + %add.ptr27 = getelementptr inbounds i8, i8* %ip_c0.0108, i32 8 + %7 = add <8 x i16> %.splat, %6 + %8 = bitcast i8* %ip_c1.0107 to <8 x i8>* + %9 = load <8 x i8>, <8 x i8>* %8, align 1 + %10 = sext <8 x i8> %9 to <8 x i16> + %add.ptr28 = getelementptr inbounds i8, i8* %ip_c1.0107, i32 8 + %11 = add <8 x i16> %.splat, %10 + %12 = bitcast i8* %ip_c2.0106 to <8 x i8>* + %13 = load <8 x i8>, <8 x i8>* %12, align 1 + %14 = sext <8 x i8> %13 to <8 x i16> + %add.ptr29 = getelementptr inbounds i8, i8* %ip_c2.0106, i32 8 + %15 = add <8 x i16> %.splat, %14 + %16 = bitcast i8* %ip_c3.0104 to <8 x i8>* + %17 = load <8 x i8>, <8 x i8>* %16, align 1 + %18 = sext <8 x i8> %17 to <8 x i16> + %add.ptr30 = getelementptr inbounds i8, i8* %ip_c3.0104, i32 8 + %19 = add <8 x i16> %.splat, %18 + %20 = tail call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 %acc_0.0100, <8 x i16> %3, <8 x i16> %7) + %21 = tail call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, 
i32 0, i32 0, i32 %acc_1.0101, <8 x i16> %3, <8 x i16> %11) + %22 = tail call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 %acc_2.0102, <8 x i16> %3, <8 x i16> %15) + %23 = tail call i32 @llvm.arm.mve.vmldava.v8i16(i32 0, i32 0, i32 0, i32 %acc_3.0103, <8 x i16> %3, <8 x i16> %19) + %inc = add nsw i32 %i_row_loop.1105, 1 + %exitcond = icmp eq i32 %inc, %div + br i1 %exitcond, label %for.cond.cleanup23, label %for.body24 + +if.end: ; preds = %for.cond.cleanup23, %for.cond.preheader, %entry + ret i8* %out +} + + +; Similar but predicated +define i32 @arm_nn_mat_mul_core_4x_s8(i32 %row_elements, i32 %offset, i8* %row_base, i8* %col_base, i32* nocapture readnone %sum_col, i32* nocapture %output) { +; CHECK-LABEL: arm_nn_mat_mul_core_4x_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r10, lr} +; CHECK-NEXT: add.w r7, r0, #15 +; CHECK-NEXT: ldr.w r12, [sp, #32] +; CHECK-NEXT: mov.w lr, #1 +; CHECK-NEXT: asrs r6, r7, #31 +; CHECK-NEXT: add.w r4, r7, r6, lsr #28 +; CHECK-NEXT: asrs r5, r4, #4 +; CHECK-NEXT: cmp r5, #1 +; CHECK-NEXT: it gt +; CHECK-NEXT: asrgt.w lr, r4, #4 +; CHECK-NEXT: cmp r0, #1 +; CHECK-NEXT: blt .LBB4_3 +; CHECK-NEXT: @ %bb.1: @ %for.body.preheader +; CHECK-NEXT: adds r5, r2, r1 +; CHECK-NEXT: add.w r7, r2, r1, lsl #1 +; CHECK-NEXT: add.w r1, r1, r1, lsl #1 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: add r1, r2 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: dlstp.8 lr, r0 +; CHECK-NEXT: .LBB4_2: @ %for.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q0, [r3], #16 +; CHECK-NEXT: vldrb.u8 q1, [r1], #16 +; CHECK-NEXT: vmlava.s8 r10, q1, q0 +; CHECK-NEXT: vldrb.u8 q1, [r7], #16 +; CHECK-NEXT: vmlava.s8 r4, q1, q0 +; CHECK-NEXT: vldrb.u8 q1, [r5], #16 +; CHECK-NEXT: vmlava.s8 r6, q1, q0 +; CHECK-NEXT: vldrb.u8 q1, [r2], #16 +; CHECK-NEXT: vmlava.s8 r8, q1, q0 +; CHECK-NEXT: letp lr, 
.LBB4_2 +; CHECK-NEXT: b .LBB4_4 +; CHECK-NEXT: .LBB4_3: +; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: .LBB4_4: @ %for.cond.cleanup +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: strd r8, r6, [r12] +; CHECK-NEXT: strd r4, r10, [r12, #8] +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r10, pc} +entry: + %add = add nsw i32 %row_elements, 15 + %div = sdiv i32 %add, 16 + %cmp84 = icmp sgt i32 %row_elements, 0 + br i1 %cmp84, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %mul2 = mul nsw i32 %offset, 3 + %add.ptr3 = getelementptr inbounds i8, i8* %row_base, i32 %mul2 + %mul = shl nsw i32 %offset, 1 + %add.ptr1 = getelementptr inbounds i8, i8* %row_base, i32 %mul + %add.ptr = getelementptr inbounds i8, i8* %row_base, i32 %offset + %0 = icmp sgt i32 %div, 1 + %smax = select i1 %0, i32 %div, i32 1 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %acc_n.sroa.12.0.lcssa = phi i32 [ 0, %entry ], [ %15, %for.body ] + %acc_n.sroa.9.0.lcssa = phi i32 [ 0, %entry ], [ %12, %for.body ] + %acc_n.sroa.6.0.lcssa = phi i32 [ 0, %entry ], [ %9, %for.body ] + %acc_n.sroa.0.0.lcssa = phi i32 [ 0, %entry ], [ %6, %for.body ] + store i32 %acc_n.sroa.0.0.lcssa, i32* %output, align 4 + %arrayidx19 = getelementptr inbounds i32, i32* %output, i32 1 + store i32 %acc_n.sroa.6.0.lcssa, i32* %arrayidx19, align 4 + %arrayidx21 = getelementptr inbounds i32, i32* %output, i32 2 + store i32 %acc_n.sroa.9.0.lcssa, i32* %arrayidx21, align 4 + %arrayidx23 = getelementptr inbounds i32, i32* %output, i32 3 + store i32 %acc_n.sroa.12.0.lcssa, i32* %arrayidx23, align 4 + ret i32 0 + +for.body: ; preds = %for.body, %for.body.preheader + %col_base.addr.095 = phi i8* [ %add.ptr4, %for.body ], [ %col_base, %for.body.preheader ] + %acc_n.sroa.0.094 = phi i32 [ %6, %for.body ], [ 0, %for.body.preheader ] + %acc_n.sroa.6.093 = phi i32 [ %9, %for.body ], [ 0, %for.body.preheader ] 
+ %acc_n.sroa.9.092 = phi i32 [ %12, %for.body ], [ 0, %for.body.preheader ] + %i.091 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %row_elem.090 = phi i32 [ %sub, %for.body ], [ %row_elements, %for.body.preheader ] + %acc_n.sroa.12.089 = phi i32 [ %15, %for.body ], [ 0, %for.body.preheader ] + %ip_row_3.088 = phi i8* [ %add.ptr15, %for.body ], [ %add.ptr3, %for.body.preheader ] + %ip_row_2.087 = phi i8* [ %add.ptr14, %for.body ], [ %add.ptr1, %for.body.preheader ] + %ip_row_1.086 = phi i8* [ %add.ptr13, %for.body ], [ %add.ptr, %for.body.preheader ] + %ip_row_0.085 = phi i8* [ %add.ptr12, %for.body ], [ %row_base, %for.body.preheader ] + %1 = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 %row_elem.090) + %sub = add nsw i32 %row_elem.090, -16 + %2 = bitcast i8* %col_base.addr.095 to <16 x i8>* + %3 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> zeroinitializer) + %add.ptr4 = getelementptr inbounds i8, i8* %col_base.addr.095, i32 16 + %4 = bitcast i8* %ip_row_0.085 to <16 x i8>* + %5 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %1, <16 x i8> zeroinitializer) + %6 = tail call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 %acc_n.sroa.0.094, <16 x i8> %5, <16 x i8> %3, <16 x i1> %1) + %7 = bitcast i8* %ip_row_1.086 to <16 x i8>* + %8 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %7, i32 1, <16 x i1> %1, <16 x i8> zeroinitializer) + %9 = tail call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 %acc_n.sroa.6.093, <16 x i8> %8, <16 x i8> %3, <16 x i1> %1) + %10 = bitcast i8* %ip_row_2.087 to <16 x i8>* + %11 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %10, i32 1, <16 x i1> %1, <16 x i8> zeroinitializer) + %12 = tail call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 %acc_n.sroa.9.092, <16 x i8> %11, <16 x i8> %3, <16 x i1> %1) + %13 = bitcast i8* 
%ip_row_3.088 to <16 x i8>* + %14 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %13, i32 1, <16 x i1> %1, <16 x i8> zeroinitializer) + %15 = tail call i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32 0, i32 0, i32 0, i32 %acc_n.sroa.12.089, <16 x i8> %14, <16 x i8> %3, <16 x i1> %1) + %add.ptr12 = getelementptr inbounds i8, i8* %ip_row_0.085, i32 16 + %add.ptr13 = getelementptr inbounds i8, i8* %ip_row_1.086, i32 16 + %add.ptr14 = getelementptr inbounds i8, i8* %ip_row_2.087, i32 16 + %add.ptr15 = getelementptr inbounds i8, i8* %ip_row_3.088, i32 16 + %inc = add nuw nsw i32 %i.091, 1 + %exitcond = icmp eq i32 %inc, %smax + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +declare <16 x i1> @llvm.arm.mve.vctp8(i32) +declare i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1>) +declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) #1 +declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) #2 +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) +declare i32 @llvm.experimental.vector.reduce.add.v16i8(<16 x i32> %ext4) +declare i32 @llvm.arm.mve.vmldava.v8i16(i32, i32, i32, i32, <8 x i16>, <8 x i16>) +declare i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32, i32, i32, i32, <16 x i8>, <16 x i8>, <16 x i1>) diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index 793b4e39284fe9..76b5f6ff228fe2 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -93,6 +93,65 @@ define i32 @length2(i8* %X, i8* %Y) nounwind { ret i32 %m } +define i32 @length2_const(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length2_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: movzwl .L.str+1, %ecx +; X86-NEXT: rolw $8, %ax +; X86-NEXT: rolw $8, %cx +; X86-NEXT: movzwl %ax, %eax +; X86-NEXT: movzwl %cx, %ecx +; X86-NEXT: subl 
%ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: length2_const: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl .L.str+{{.*}}(%rip), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i64 2) nounwind + ret i32 %m +} + +define i1 @length2_gt_const(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length2_gt_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: movzwl .L.str+1, %ecx +; X86-NEXT: rolw $8, %ax +; X86-NEXT: rolw $8, %cx +; X86-NEXT: movzwl %ax, %eax +; X86-NEXT: movzwl %cx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl +; +; X64-LABEL: length2_gt_const: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl .L.str+{{.*}}(%rip), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setg %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i64 2) nounwind + %c = icmp sgt i32 %m, 0 + ret i1 %c +} + define i1 @length2_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length2_eq: ; X86: # %bb.0: @@ -238,14 +297,14 @@ define i32 @length3(i8* %X, i8* %Y) nounwind { ; X86-NEXT: rolw $8, %dx ; X86-NEXT: rolw $8, %si ; X86-NEXT: cmpw %si, %dx -; X86-NEXT: jne .LBB9_1 +; X86-NEXT: jne .LBB11_1 ; X86-NEXT: # %bb.2: # %loadbb1 ; X86-NEXT: movzbl 2(%eax), %eax ; X86-NEXT: movzbl 2(%ecx), %ecx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl -; X86-NEXT: .LBB9_1: # %res_block +; X86-NEXT: .LBB11_1: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: leal 
-1(%eax,%eax), %eax @@ -259,13 +318,13 @@ define i32 @length3(i8* %X, i8* %Y) nounwind { ; X64-NEXT: rolw $8, %ax ; X64-NEXT: rolw $8, %cx ; X64-NEXT: cmpw %cx, %ax -; X64-NEXT: jne .LBB9_1 +; X64-NEXT: jne .LBB11_1 ; X64-NEXT: # %bb.2: # %loadbb1 ; X64-NEXT: movzbl 2(%rdi), %eax ; X64-NEXT: movzbl 2(%rsi), %ecx ; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq -; X64-NEXT: .LBB9_1: # %res_block +; X64-NEXT: .LBB11_1: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: leal -1(%rax,%rax), %eax @@ -453,14 +512,14 @@ define i32 @length5(i8* %X, i8* %Y) nounwind { ; X86-NEXT: bswapl %edx ; X86-NEXT: bswapl %esi ; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB16_1 +; X86-NEXT: jne .LBB18_1 ; X86-NEXT: # %bb.2: # %loadbb1 ; X86-NEXT: movzbl 4(%eax), %eax ; X86-NEXT: movzbl 4(%ecx), %ecx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl -; X86-NEXT: .LBB16_1: # %res_block +; X86-NEXT: .LBB18_1: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: leal -1(%eax,%eax), %eax @@ -474,13 +533,13 @@ define i32 @length5(i8* %X, i8* %Y) nounwind { ; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx ; X64-NEXT: cmpl %ecx, %eax -; X64-NEXT: jne .LBB16_1 +; X64-NEXT: jne .LBB18_1 ; X64-NEXT: # %bb.2: # %loadbb1 ; X64-NEXT: movzbl 4(%rdi), %eax ; X64-NEXT: movzbl 4(%rsi), %ecx ; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq -; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: .LBB18_1: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: leal -1(%rax,%rax), %eax @@ -529,17 +588,17 @@ define i1 @length5_lt(i8* %X, i8* %Y) nounwind { ; X86-NEXT: bswapl %edx ; X86-NEXT: bswapl %esi ; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB18_1 +; X86-NEXT: jne .LBB20_1 ; X86-NEXT: # %bb.2: # %loadbb1 ; X86-NEXT: movzbl 4(%eax), %eax ; X86-NEXT: movzbl 4(%ecx), %ecx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB18_3 -; X86-NEXT: .LBB18_1: # %res_block +; X86-NEXT: jmp .LBB20_3 +; X86-NEXT: .LBB20_1: # %res_block ; 
X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB18_3: # %endblock +; X86-NEXT: .LBB20_3: # %endblock ; X86-NEXT: shrl $31, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: popl %esi @@ -552,7 +611,7 @@ define i1 @length5_lt(i8* %X, i8* %Y) nounwind { ; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx ; X64-NEXT: cmpl %ecx, %eax -; X64-NEXT: jne .LBB18_1 +; X64-NEXT: jne .LBB20_1 ; X64-NEXT: # %bb.2: # %loadbb1 ; X64-NEXT: movzbl 4(%rdi), %eax ; X64-NEXT: movzbl 4(%rsi), %ecx @@ -560,7 +619,7 @@ define i1 @length5_lt(i8* %X, i8* %Y) nounwind { ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq -; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: .LBB20_1: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: leal -1(%rax,%rax), %eax @@ -610,7 +669,7 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB20_2 +; X86-NEXT: jne .LBB22_2 ; X86-NEXT: # %bb.1: # %loadbb1 ; X86-NEXT: movl 4(%esi), %ecx ; X86-NEXT: movl 4(%eax), %edx @@ -618,13 +677,13 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-NEXT: bswapl %edx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB20_3 -; X86-NEXT: .LBB20_2: # %res_block +; X86-NEXT: je .LBB22_3 +; X86-NEXT: .LBB22_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al ; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB20_3: # %endblock +; X86-NEXT: .LBB22_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -818,7 +877,7 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB27_2 +; X64-NEXT: jne .LBB29_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx @@ -826,13 +885,13 @@ define i32 @length12(i8* 
%X, i8* %Y) nounwind { ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB27_3 -; X64-NEXT: .LBB27_2: # %res_block +; X64-NEXT: je .LBB29_3 +; X64-NEXT: .LBB29_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB27_3: # %endblock +; X64-NEXT: .LBB29_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -892,6 +951,26 @@ define i1 @length14_eq(i8* %X, i8* %Y) nounwind { ret i1 %c } +define i32 @length15_const(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length15_const: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $15 +; X86-NEXT: pushl $.L.str+1 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length15_const: +; X64: # %bb.0: +; X64-NEXT: movl $.L.str+1, %esi +; X64-NEXT: movl $15, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i64 15) nounwind + ret i32 %m +} + define i1 @length15_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length15_eq: ; X86: # %bb.0: @@ -919,6 +998,34 @@ define i1 @length15_eq(i8* %X, i8* %Y) nounwind { ret i1 %c } +define i1 @length15_gt_const(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length15_gt_const: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $15 +; X86-NEXT: pushl $.L.str+1 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl +; +; X64-LABEL: length15_gt_const: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $.L.str+1, %esi +; X64-NEXT: movl $15, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setg %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds 
([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i64 15) nounwind + %c = icmp sgt i32 %m, 0 + ret i1 %c +} + ; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 define i32 @length16(i8* %X, i8* %Y) nounwind { @@ -939,7 +1046,7 @@ define i32 @length16(i8* %X, i8* %Y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB31_2 +; X64-NEXT: jne .LBB35_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx @@ -947,13 +1054,13 @@ define i32 @length16(i8* %X, i8* %Y) nounwind { ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB31_3 -; X64-NEXT: .LBB31_2: # %res_block +; X64-NEXT: je .LBB35_3 +; X64-NEXT: .LBB35_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB31_3: # %endblock +; X64-NEXT: .LBB35_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -1068,7 +1175,7 @@ define i1 @length16_lt(i8* %x, i8* %y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB33_2 +; X64-NEXT: jne .LBB37_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx @@ -1076,13 +1183,13 @@ define i1 @length16_lt(i8* %x, i8* %y) nounwind { ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB33_3 -; X64-NEXT: .LBB33_2: # %res_block +; X64-NEXT: je .LBB37_3 +; X64-NEXT: .LBB37_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB33_3: # %endblock +; X64-NEXT: .LBB37_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -1111,7 +1218,7 @@ define i1 @length16_gt(i8* %x, i8* %y) nounwind { ; X64-NEXT: bswapq %rax ; 
X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: jne .LBB34_2 +; X64-NEXT: jne .LBB38_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rax ; X64-NEXT: movq 8(%rsi), %rcx @@ -1119,13 +1226,13 @@ define i1 @length16_gt(i8* %x, i8* %y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: je .LBB34_3 -; X64-NEXT: .LBB34_2: # %res_block +; X64-NEXT: je .LBB38_3 +; X64-NEXT: .LBB38_2: # %res_block ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: setae %dl ; X64-NEXT: leal -1(%rdx,%rdx), %edx -; X64-NEXT: .LBB34_3: # %endblock +; X64-NEXT: .LBB38_3: # %endblock ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setg %al ; X64-NEXT: retq diff --git a/llvm/test/DebugInfo/X86/debug-macinfo-split-dwarf.ll b/llvm/test/DebugInfo/X86/debug-macinfo-split-dwarf.ll index e75f138354e9ae..f7cbff4013ca77 100644 --- a/llvm/test/DebugInfo/X86/debug-macinfo-split-dwarf.ll +++ b/llvm/test/DebugInfo/X86/debug-macinfo-split-dwarf.ll @@ -8,6 +8,7 @@ ; CHECK: DW_AT_macro_info (0x00000000) ;CHECK-LABEL:.debug_macinfo.dwo contents: +;CHECK-NEXT: 0x00000000: ;CHECK-NEXT: DW_MACINFO_start_file - lineno: 0 filenum: 1 ;CHECK-NEXT: DW_MACINFO_start_file - lineno: 1 filenum: 2 ;CHECK-NEXT: DW_MACINFO_define - lineno: 1 macro: define_1 12 diff --git a/llvm/test/DebugInfo/X86/debug-macro.ll b/llvm/test/DebugInfo/X86/debug-macro.ll index 6a5ae1e7021631..fbcfab6610d5e5 100644 --- a/llvm/test/DebugInfo/X86/debug-macro.ll +++ b/llvm/test/DebugInfo/X86/debug-macro.ll @@ -16,6 +16,7 @@ ; CHECK-NOT: DW_AT_macro_info ; CHECK-LABEL: .debug_macinfo contents: +; CHECK-NEXT: 0x00000000: ; CHECK-NEXT: DW_MACINFO_define - lineno: 0 macro: NameCMD ValueCMD ; CHECK-NEXT: DW_MACINFO_start_file - lineno: 0 filenum: 1 ; CHECK-NEXT: DW_MACINFO_start_file - lineno: 9 filenum: 2 @@ -24,8 +25,9 @@ ; CHECK-NEXT: DW_MACINFO_end_file ; CHECK-NEXT: DW_MACINFO_undef - lineno: 10 macro: NameUndef2 ; CHECK-NEXT: DW_MACINFO_end_file - -; CHECK: 
DW_MACINFO_start_file - lineno: 0 filenum: 1 +; CHECK-EMPTY: +; CHECK-NEXT: 0x00000045: +; CHECK-NEXT: DW_MACINFO_start_file - lineno: 0 filenum: 1 ; CHECK-NEXT: DW_MACINFO_end_file ; CHECK-LABEL: .debug_line contents: diff --git a/llvm/test/DebugInfo/duplicate_dbgvalue.ll b/llvm/test/DebugInfo/duplicate_dbgvalue.ll new file mode 100644 index 00000000000000..2145b6ef5c83bf --- /dev/null +++ b/llvm/test/DebugInfo/duplicate_dbgvalue.ll @@ -0,0 +1,169 @@ +; RUN: opt -instcombine -S -o - < %s | FileCheck %s + +; CHECK-LABEL: %4 = load i32, i32* %i1_311 +; CHECK: call void @llvm.dbg.value(metadata i32 %4 +; Next instruction should not be duplicate dbg.value intrinsic. +; CHECK-NEXT: @f90io_sc_i_ldw + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;program main +;integer :: res +; res = mfun() +; print *, res +;contains +; function mfun() +; integer :: i1 +; i1 = 5 +; mfun = fun(i1) +; write (*,*) i1 +; end function +; function fun(a) +; integer, intent (in) :: a +; fun = a +; end function +;end program main +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; ModuleID = 'duplicate_dbgvalue.ll' +source_filename = "duplicate_dbgvalue.ll" +target datalayout = "e-p:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.BSS1 = type <{ [4 x i8] }> + +@.BSS1 = internal unnamed_addr global %struct.BSS1 zeroinitializer, align 32, !dbg !0 +@.C303_MAIN_ = internal constant i32 6 +@.C300_MAIN_ = internal constant [22 x i8] c"duplicate_dbgvalue.f90" +@.C302_MAIN_ = internal constant i32 4 +@.C283_MAIN_ = internal constant i32 0 +@.C283_main_mfun = internal constant i32 0 +@.C302_main_mfun = internal constant i32 6 +@.C300_main_mfun = internal constant [22 x i8] c"duplicate_dbgvalue.f90" +@.C313_main_mfun = internal constant i32 10 + +define void @MAIN_() local_unnamed_addr !dbg !2 { +L.entry: + call void (i8*, ...) 
bitcast (void (...)* @fort_init to void (i8*, ...)*)(i8* bitcast (i32* @.C283_MAIN_ to i8*)), !dbg !16 + %0 = call fastcc i32 @main_mfun(), !dbg !18 + store i32 %0, i32* bitcast (%struct.BSS1* @.BSS1 to i32*), align 32, !dbg !18 + call void (i8*, i8*, i64, ...) bitcast (void (...)* @f90io_src_info03a to void (i8*, i8*, i64, ...)*)(i8* bitcast (i32* @.C302_MAIN_ to i8*), i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.C300_MAIN_, i64 0, i64 0), i64 22), !dbg !23 + %1 = call i32 (i8*, i8*, i8*, i8*, ...) bitcast (i32 (...)* @f90io_print_init to i32 (i8*, i8*, i8*, i8*, ...)*)(i8* bitcast (i32* @.C303_MAIN_ to i8*), i8* null, i8* bitcast (i32* @.C283_MAIN_ to i8*), i8* bitcast (i32* @.C283_MAIN_ to i8*)), !dbg !23 + call void @llvm.dbg.value(metadata i32 %1, metadata !24, metadata !DIExpression()), !dbg !25 + %2 = load i32, i32* bitcast (%struct.BSS1* @.BSS1 to i32*), align 32, !dbg !23 + %3 = call i32 (i32, i32, ...) bitcast (i32 (...)* @f90io_sc_i_ldw to i32 (i32, i32, ...)*)(i32 %2, i32 25), !dbg !23 + call void @llvm.dbg.value(metadata i32 %3, metadata !24, metadata !DIExpression()), !dbg !25 + %4 = call i32 (...) @f90io_ldw_end(), !dbg !23 + call void @llvm.dbg.value(metadata i32 %4, metadata !24, metadata !DIExpression()), !dbg !25 + ret void, !dbg !26 +} + +define internal fastcc signext i32 @main_mfun() unnamed_addr !dbg !27 { +L.entry: + %i1_311 = alloca i32, align 4 + call void @llvm.dbg.declare(metadata i64* undef, metadata !31, metadata !DIExpression()), !dbg !33 + call void @llvm.dbg.declare(metadata i32* %i1_311, metadata !35, metadata !DIExpression()), !dbg !33 + store i32 5, i32* %i1_311, align 4, !dbg !36 + %0 = bitcast i32* %i1_311 to i64*, !dbg !41 + %1 = call fastcc float @main_fun(i64* %0), !dbg !41 + %2 = fptosi float %1 to i32, !dbg !41 + call void (i8*, i8*, i64, ...) 
bitcast (void (...)* @f90io_src_info03a to void (i8*, i8*, i64, ...)*)(i8* bitcast (i32* @.C313_main_mfun to i8*), i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.C300_main_mfun, i32 0, i32 0), i64 22), !dbg !42 + %3 = call i32 (i8*, i8*, i8*, i8*, ...) bitcast (i32 (...)* @f90io_print_init to i32 (i8*, i8*, i8*, i8*, ...)*)(i8* bitcast (i32* @.C302_main_mfun to i8*), i8* null, i8* bitcast (i32* @.C283_main_mfun to i8*), i8* bitcast (i32* @.C283_main_mfun to i8*)), !dbg !42 + call void @llvm.dbg.value(metadata i32 %3, metadata !43, metadata !DIExpression()), !dbg !33 + %4 = load i32, i32* %i1_311, align 4, !dbg !42 + call void @llvm.dbg.value(metadata i32 %4, metadata !35, metadata !DIExpression()), !dbg !33 + %5 = call i32 (i32, i32, ...) bitcast (i32 (...)* @f90io_sc_i_ldw to i32 (i32, i32, ...)*)(i32 %4, i32 25), !dbg !42 + call void @llvm.dbg.value(metadata i32 %5, metadata !43, metadata !DIExpression()), !dbg !33 + %6 = call i32 (...) @f90io_ldw_end(), !dbg !42 + call void @llvm.dbg.value(metadata i32 %6, metadata !43, metadata !DIExpression()), !dbg !33 + ret i32 %2, !dbg !44 +} + +define internal fastcc float @main_fun(i64* noalias %a) unnamed_addr !dbg !45 { +L.entry: + call void @llvm.dbg.declare(metadata i64* %a, metadata !50, metadata !DIExpression()), !dbg !51 + call void @llvm.dbg.declare(metadata i64* undef, metadata !53, metadata !DIExpression()), !dbg !51 + %0 = bitcast i64* %a to i32*, !dbg !54 + %1 = load i32, i32* %0, align 4, !dbg !54 + %2 = sitofp i32 %1 to float, !dbg !54 + ret float %2, !dbg !59 +} + +declare signext i32 @f90io_ldw_end(...) local_unnamed_addr + +declare signext i32 @f90io_sc_i_ldw(...) local_unnamed_addr + +; Function Attrs: nounwind readnone speculatable willreturn +declare void @llvm.dbg.declare(metadata, metadata, metadata) + +declare signext i32 @f90io_print_init(...) local_unnamed_addr + +declare void @f90io_src_info03a(...) local_unnamed_addr + +declare void @fort_init(...) 
local_unnamed_addr + +; Function Attrs: nounwind readnone speculatable willreturn +declare void @llvm.dbg.value(metadata, metadata, metadata) + +!llvm.module.flags = !{!14, !15} +!llvm.dbg.cu = !{!4} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "res", scope: !2, file: !3, type: !9, isLocal: true, isDefinition: true) +!2 = distinct !DISubprogram(name: "main", scope: !4, file: !3, line: 1, type: !12, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagMainSubprogram, unit: !4) +!3 = !DIFile(filename: "duplicate-dbgvalue.f90", directory: "/dir") +!4 = distinct !DICompileUnit(language: DW_LANG_Fortran90, file: !3, producer: " F90 Flang - 1.5", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !5, retainedTypes: !5, globals: !6, imports: !5) +!5 = !{} +!6 = !{!0, !7, !10} +!7 = !DIGlobalVariableExpression(var: !8, expr: !DIExpression()) +!8 = distinct !DIGlobalVariable(name: "res", scope: !4, file: !3, type: !9, isLocal: true, isDefinition: true) +!9 = !DIBasicType(name: "integer", size: 32, align: 32, encoding: DW_ATE_signed) +!10 = !DIGlobalVariableExpression(var: !11, expr: !DIExpression()) +!11 = distinct !DIGlobalVariable(name: "res", scope: !4, file: !3, type: !9, isLocal: true, isDefinition: true) +!12 = !DISubroutineType(cc: DW_CC_program, types: !13) +!13 = !{null} +!14 = !{i32 2, !"Dwarf Version", i32 4} +!15 = !{i32 2, !"Debug Info Version", i32 3} +!16 = !DILocation(line: 1, column: 1, scope: !17) +!17 = !DILexicalBlock(scope: !2, file: !3, line: 1, column: 1) +!18 = !DILocation(line: 3, column: 1, scope: !17) +!19 = !{!20, !20, i64 0} +!20 = !{!"t1.2", !21, i64 0} +!21 = !{!"unlimited ptr", !22, i64 0} +!22 = !{!"Flang FAA 1"} +!23 = !DILocation(line: 4, column: 1, scope: !17) +!24 = !DILocalVariable(scope: !17, file: !3, type: !9, flags: DIFlagArtificial) +!25 = !DILocation(line: 0, scope: !17) +!26 = !DILocation(line: 5, column: 1, scope: !17) +!27 = distinct 
!DISubprogram(name: "mfun", scope: !2, file: !3, line: 6, type: !28, scopeLine: 6, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !4) +!28 = !DISubroutineType(types: !29) +!29 = !{!30} +!30 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, align: 64) +!31 = !DILocalVariable(arg: 1, scope: !27, file: !3, type: !32, flags: DIFlagArtificial) +!32 = !DIBasicType(name: "uinteger*8", size: 64, align: 64, encoding: DW_ATE_unsigned) +!33 = !DILocation(line: 0, scope: !34) +!34 = !DILexicalBlock(scope: !27, file: !3, line: 6, column: 1) +!35 = !DILocalVariable(name: "i1", scope: !34, file: !3, type: !9) +!36 = !DILocation(line: 8, column: 1, scope: !34) +!37 = !{!38, !38, i64 0} +!38 = !{!"t2.2", !39, i64 0} +!39 = !{!"unlimited ptr", !40, i64 0} +!40 = !{!"Flang FAA 2"} +!41 = !DILocation(line: 9, column: 1, scope: !34) +!42 = !DILocation(line: 10, column: 1, scope: !34) +!43 = !DILocalVariable(scope: !34, file: !3, type: !9, flags: DIFlagArtificial) +!44 = !DILocation(line: 11, column: 1, scope: !34) +!45 = distinct !DISubprogram(name: "fun", scope: !2, file: !3, line: 12, type: !46, scopeLine: 12, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !4) +!46 = !DISubroutineType(types: !47) +!47 = !{!48, !9} +!48 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !49, size: 64, align: 64) +!49 = !DIBasicType(name: "real", size: 32, align: 32, encoding: DW_ATE_float) +!50 = !DILocalVariable(name: "a", arg: 1, scope: !45, file: !3, type: !9) +!51 = !DILocation(line: 0, scope: !52) +!52 = !DILexicalBlock(scope: !45, file: !3, line: 12, column: 1) +!53 = !DILocalVariable(arg: 2, scope: !45, file: !3, type: !32, flags: DIFlagArtificial) +!54 = !DILocation(line: 14, column: 1, scope: !52) +!55 = !{!56, !56, i64 0} +!56 = !{!"t3.2", !57, i64 0} +!57 = !{!"unlimited ptr", !58, i64 0} +!58 = !{!"Flang FAA 3"} +!59 = !DILocation(line: 15, column: 1, scope: !52) diff --git a/llvm/test/MC/X86/align-branch-64-system.s 
b/llvm/test/MC/X86/align-branch-64-system.s new file mode 100644 index 00000000000000..b62a4e3e136f8c --- /dev/null +++ b/llvm/test/MC/X86/align-branch-64-system.s @@ -0,0 +1,68 @@ + # RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu --x86-align-branch-boundary=32 --x86-align-branch=jmp %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s + + # Exercise cases where we're enabling interrupts with one instruction delay + # and thus can't add a nop in between without changing behavior. + + .text + + # CHECK: 1e: sti + # CHECK: 1f: jmp + .p2align 5 + .rept 30 + int3 + .endr + sti + jmp baz + + # CHECK: 5c: movq %rax, %ss + # CHECK: 5f: jmp + .p2align 5 + .rept 28 + int3 + .endr + movq %rax, %ss + jmp baz + + # CHECK: 9d: movl %esi, %ss + # CHECK: 9f: jmp + .p2align 5 + .rept 29 + int3 + .endr + movl %esi, %ss + jmp baz + + # movw and movl are interchangeable since we're only using the low 16 bits. + # Both are generated as "MOV Sreg,r/m16**" (8E /r), but disassembled as movl + # CHECK: dd: movl %esi, %ss + # CHECK: df: jmp + .p2align 5 + .rept 29 + int3 + .endr + movw %si, %ss + jmp baz + + # CHECK: 11b: movw (%esi), %ss + # CHECK: 11e: jmp + .p2align 5 + .rept 27 + int3 + .endr + movw (%esi), %ss + jmp baz + + # CHECK: 15b: movw (%rsi), %ss + # CHECK: 15d: jmp + .p2align 5 + .rept 27 + int3 + .endr + movw (%rsi), %ss + jmp baz + + + int3 + .section ".text.other" +bar: + retq diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll index bab23c924d64b8..4bfee73720f746 100644 --- a/llvm/test/Other/new-pm-lto-defaults.ll +++ b/llvm/test/Other/new-pm-lto-defaults.ll @@ -92,7 +92,6 @@ ; CHECK-O2-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O2-NEXT: Running pass: CrossDSOCFIPass ; CHECK-O2-NEXT: Running pass: LowerTypeTestsPass -; CHECK-O-NEXT: Running pass: LowerTypeTestsPass ; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}SimplifyCFGPass> ; CHECK-O2-NEXT: Running pass: EliminateAvailableExternallyPass 
; CHECK-O2-NEXT: Running pass: GlobalDCEPass diff --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll index 6326bec87a59f4..a6faeccb30dd19 100644 --- a/llvm/test/Other/new-pm-thinlto-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-defaults.ll @@ -80,7 +80,6 @@ ; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. ; CHECK-O-NEXT: Running pass: AttributorPass -; CHECK-POSTLINK-O-NEXT: Running pass: LowerTypeTestsPass ; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running pass: GlobalOptPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index 9c5fdc6458a54b..0c2e9328c11a4c 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -49,7 +49,6 @@ ; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass ; CHECK-O-NEXT: Finished {{.*}}Function pass manager run. 
; CHECK-O-NEXT: Running pass: AttributorPass -; CHECK-O-NEXT: Running pass: LowerTypeTestsPass ; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running pass: GlobalOptPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index 01d951703a6b67..29d379ab54ad79 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -60,7 +60,6 @@ ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ProfileSummaryAnalysis ; CHECK-O-NEXT: Running pass: PGOIndirectCallPromotion ; CHECK-O-NEXT: Running pass: AttributorPass -; CHECK-O-NEXT: Running pass: LowerTypeTestsPass ; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running pass: GlobalOptPass diff --git a/llvm/test/ThinLTO/X86/Inputs/cfi-unsat.ll b/llvm/test/ThinLTO/X86/Inputs/cfi-unsat.ll deleted file mode 100644 index bc7a0e36dfa3b5..00000000000000 --- a/llvm/test/ThinLTO/X86/Inputs/cfi-unsat.ll +++ /dev/null @@ -1,50 +0,0 @@ -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-grtev4-linux-gnu" - -%struct.A = type { i32 (...)** } -%struct.B = type { i32 (...)** } - -@_ZTV1B = linkonce_odr constant { [4 x i8*] } { [4 x i8*] [i8* null, i8* undef, i8* bitcast (i32 (%struct.B*, i32)* @_ZN1B1fEi to i8*), i8* bitcast (i32 (%struct.B*, i32)* @_ZN1B1nEi to i8*)] }, !type !0 - -$test = comdat any - -; CHECK-IR-LABEL: define i32 @test -define linkonce_odr i32 @test(%struct.A* %obj, i32 %a) comdat { -entry: - %0 = bitcast %struct.A* %obj to i8** - %vtable5 = load i8*, i8** %0 - - %1 = tail call { i8*, i1 } @llvm.type.checked.load(i8* %vtable5, i32 8, metadata !"_ZTS1A") - %2 = extractvalue { i8*, i1 } %1, 1 - br i1 %2, label %cont, label %trap - -trap: - tail call void 
@llvm.trap() - unreachable - -cont: - %3 = extractvalue { i8*, i1 } %1, 0 - %4 = bitcast i8* %3 to i32 (%struct.A*, i32)* - - ; Check that the call was devirtualized. - ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi - %call = tail call i32 %4(%struct.A* nonnull %obj, i32 %a) - - ret i32 %call -} -; CHECK-IR-LABEL: ret i32 -; CHECK-IR-LABEL: } - -declare { i8*, i1 } @llvm.type.checked.load(i8*, i32, metadata) -declare void @llvm.trap() - -define internal i32 @_ZN1B1fEi(%struct.B* %this, i32 %a) { -entry: - ret i32 0 -} -define internal i32 @_ZN1B1nEi(%struct.B* %this, i32 %a) { -entry: - ret i32 0 -} - -!0 = !{i64 16, !"_ZTS1B"} diff --git a/llvm/test/ThinLTO/X86/cfi-unsat.ll b/llvm/test/ThinLTO/X86/cfi-unsat.ll deleted file mode 100644 index 24e837303c2ade..00000000000000 --- a/llvm/test/ThinLTO/X86/cfi-unsat.ll +++ /dev/null @@ -1,74 +0,0 @@ -; REQUIRES: x86-registered-target - -; Test CFI devirtualization through the thin link and backend when -; a type id is Unsat (not used on any global's type metadata). -; -; In this test case, the first module is split and will import a resolution -; for its type test. The resolution would be exported by the second -; module, which is set up so that it does not get split (treated as regular -; LTO because it does not have any external globals from which to create -; a unique module ID). We should not actually get any resolution for the -; type id in this case, since no globals include it in their type metadata, -; so the resolution is Unsat and the type.checked.load instructions are -; converted to type tests that evaluate to false. - -; RUN: opt -thinlto-bc -thinlto-split-lto-unit -o %t.o %s -; RUN: opt -thinlto-bc -thinlto-split-lto-unit -o %t1.o %p/Inputs/cfi-unsat.ll - -; RUN: llvm-lto2 run %t.o %t1.o -save-temps -use-new-pm -pass-remarks=. 
\ -; RUN: -whole-program-visibility \ -; RUN: -o %t3 \ -; RUN: -r=%t.o,test2,px \ -; RUN: -r=%t1.o,_ZTV1B,px \ -; RUN: -r=%t1.o,test,px -; RUN: llvm-dis %t3.index.bc -o - | FileCheck %s --check-prefix=INDEX -; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR0 -; RUN: llvm-dis %t3.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR1 - -; INDEX-NOT: "typeid:" - -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-grtev4-linux-gnu" - -%struct.A = type { i32 (...)** } - -$test2 = comdat any - -; CHECK-IR0: define weak_odr i32 @test -define linkonce_odr i32 @test2(%struct.A* %obj, i32 %a) comdat { -; CHECK-IR0-NEXT: entry: -entry: -; CHECK-IR0-NEXT: %0 = bitcast - %0 = bitcast %struct.A* %obj to i8** -; CHECK-IR0-NEXT: %vtable5 = - %vtable5 = load i8*, i8** %0 - -; CHECK-IR0-NEXT: tail call void @llvm.trap() -; CHECK-IR0-NEXT: unreachable - - %1 = tail call { i8*, i1 } @llvm.type.checked.load(i8* %vtable5, i32 8, metadata !"_ZTS1A") - %2 = extractvalue { i8*, i1 } %1, 1 - br i1 %2, label %cont, label %trap - -trap: - tail call void @llvm.trap() - unreachable - -cont: - %3 = extractvalue { i8*, i1 } %1, 0 - %4 = bitcast i8* %3 to i32 (%struct.A*, i32)* - - %call = tail call i32 %4(%struct.A* nonnull %obj, i32 %a) - - ret i32 %call -; CHECK-IR0-NEXT: } -} - -; CHECK-IR1: define weak_odr i32 @test2 -; CHECK-IR1-NEXT: entry: -; CHECK-IR1-NEXT: tail call void @llvm.trap() -; CHECK-IR1-NEXT: unreachable -; CHECK-IR1-NEXT: } - -declare { i8*, i1 } @llvm.type.checked.load(i8*, i32, metadata) -declare void @llvm.trap() diff --git a/llvm/test/Transforms/InstCombine/multi-use-load-casts.ll b/llvm/test/Transforms/InstCombine/multi-use-load-casts.ll new file mode 100644 index 00000000000000..147d893e285eea --- /dev/null +++ b/llvm/test/Transforms/InstCombine/multi-use-load-casts.ll @@ -0,0 +1,153 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S 
-instcombine < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +; Positive test - all uses are identical casts. +define void @t0(i1 zeroext %c0, i1 zeroext %c1, i64* nocapture readonly %src) { +; CHECK-LABEL: @t0( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[DATA:%.*]] = load i64, i64* [[SRC:%.*]], align 8 +; CHECK-NEXT: br i1 [[C0:%.*]], label [[BB3:%.*]], label [[BB7:%.*]] +; CHECK: bb3: +; CHECK-NEXT: br i1 [[C1:%.*]], label [[BB4:%.*]], label [[BB5:%.*]] +; CHECK: bb4: +; CHECK-NEXT: tail call void @abort() +; CHECK-NEXT: unreachable +; CHECK: bb5: +; CHECK-NEXT: [[PTR0:%.*]] = inttoptr i64 [[DATA]] to i32* +; CHECK-NEXT: tail call void @sink0(i32* [[PTR0]]) +; CHECK-NEXT: br label [[BB9:%.*]] +; CHECK: bb7: +; CHECK-NEXT: [[PTR1:%.*]] = inttoptr i64 [[DATA]] to i32* +; CHECK-NEXT: tail call void @sink1(i32* [[PTR1]]) +; CHECK-NEXT: br label [[BB9]] +; CHECK: bb9: +; CHECK-NEXT: ret void +; +bb: + %data = load i64, i64* %src, align 8 + br i1 %c0, label %bb3, label %bb7 + +bb3: + br i1 %c1, label %bb4, label %bb5 + +bb4: + tail call void @abort() + unreachable + +bb5: + %ptr0 = inttoptr i64 %data to i32* + tail call void @sink0(i32* %ptr0) + br label %bb9 + +bb7: + %ptr1 = inttoptr i64 %data to i32* + tail call void @sink1(i32* %ptr1) + br label %bb9 + +bb9: + ret void +} + +; Negative test - all uses are casts, but non-identical ones. 
+define void @n1(i1 zeroext %c0, i1 zeroext %c1, i64* nocapture readonly %src) { +; CHECK-LABEL: @n1( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[DATA:%.*]] = load i64, i64* [[SRC:%.*]], align 8 +; CHECK-NEXT: br i1 [[C0:%.*]], label [[BB3:%.*]], label [[BB7:%.*]] +; CHECK: bb3: +; CHECK-NEXT: br i1 [[C1:%.*]], label [[BB4:%.*]], label [[BB5:%.*]] +; CHECK: bb4: +; CHECK-NEXT: tail call void @abort() +; CHECK-NEXT: unreachable +; CHECK: bb5: +; CHECK-NEXT: [[PTR0:%.*]] = inttoptr i64 [[DATA]] to i32* +; CHECK-NEXT: tail call void @sink0(i32* [[PTR0]]) +; CHECK-NEXT: br label [[BB9:%.*]] +; CHECK: bb7: +; CHECK-NEXT: [[VEC:%.*]] = bitcast i64 [[DATA]] to <2 x i32> +; CHECK-NEXT: tail call void @sink2(<2 x i32> [[VEC]]) +; CHECK-NEXT: br label [[BB9]] +; CHECK: bb9: +; CHECK-NEXT: ret void +; +bb: + %data = load i64, i64* %src, align 8 + br i1 %c0, label %bb3, label %bb7 + +bb3: + br i1 %c1, label %bb4, label %bb5 + +bb4: + tail call void @abort() + unreachable + +bb5: + %ptr0 = inttoptr i64 %data to i32* + tail call void @sink0(i32* %ptr0) + br label %bb9 + +bb7: + %vec = bitcast i64 %data to <2 x i32> ; different cast + tail call void @sink2(<2 x i32> %vec) + br label %bb9 + +bb9: + ret void +} + +; Negative test - have non-cast users. 
+define void @n2(i1 zeroext %c0, i1 zeroext %c1, i64* nocapture readonly %src) { +; CHECK-LABEL: @n2( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[DATA:%.*]] = load i64, i64* [[SRC:%.*]], align 8 +; CHECK-NEXT: br i1 [[C0:%.*]], label [[BB3:%.*]], label [[BB7:%.*]] +; CHECK: bb3: +; CHECK-NEXT: br i1 [[C1:%.*]], label [[BB4:%.*]], label [[BB5:%.*]] +; CHECK: bb4: +; CHECK-NEXT: tail call void @abort() +; CHECK-NEXT: unreachable +; CHECK: bb5: +; CHECK-NEXT: [[PTR0:%.*]] = inttoptr i64 [[DATA]] to i32* +; CHECK-NEXT: tail call void @sink0(i32* [[PTR0]]) +; CHECK-NEXT: br label [[BB9:%.*]] +; CHECK: bb7: +; CHECK-NEXT: tail call void @sink3(i64 [[DATA]]) +; CHECK-NEXT: br label [[BB9]] +; CHECK: bb9: +; CHECK-NEXT: ret void +; +bb: + %data = load i64, i64* %src, align 8 + br i1 %c0, label %bb3, label %bb7 + +bb3: + br i1 %c1, label %bb4, label %bb5 + +bb4: + tail call void @abort() + unreachable + +bb5: + %ptr0 = inttoptr i64 %data to i32* + tail call void @sink0(i32* %ptr0) + br label %bb9 + +bb7: + tail call void @sink3(i64 %data) ; non-cast use + br label %bb9 + +bb9: + ret void +} + +declare void @abort() + +declare void @sink0(i32*) + +declare void @sink1(i32*) + +declare void @sink2(<2 x i32>) + +declare void @sink3(i64) diff --git a/llvm/test/Transforms/LICM/freeze.ll b/llvm/test/Transforms/LICM/freeze.ll new file mode 100644 index 00000000000000..f17c270c97bec7 --- /dev/null +++ b/llvm/test/Transforms/LICM/freeze.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -licm -S < %s | FileCheck %s + +define void @hoist(i1 %a) { +; CHECK-LABEL: @hoist( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B:%.*]] = freeze i1 [[A:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: call void @use(i1 [[B]]) +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop +loop: + %b = freeze i1 %a + call void @use(i1 %b) + br label %loop +} + +define i1 @sink(i1 %a) { +; CHECK-LABEL: @sink( +; CHECK-NEXT: entry: +; 
CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[C:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[FR_LE:%.*]] = freeze i1 [[A:%.*]] +; CHECK-NEXT: ret i1 [[FR_LE]] +; +entry: + br label %loop +loop: + %fr = freeze i1 %a + %c = call i1 @cond() + br i1 %c, label %loop, label %exit +exit: + ret i1 %fr +} + +declare i1 @cond() +declare void @use(i1) diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-conditions.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-conditions.ll index 5c84884c66dea0..f0fbf3d6d49b1d 100644 --- a/llvm/test/Transforms/LoopUnroll/peel-loop-conditions.ll +++ b/llvm/test/Transforms/LoopUnroll/peel-loop-conditions.ll @@ -403,76 +403,11 @@ for.end: ret void } -; In this case we cannot peel the inner loop, because the condition involves -; the outer induction variable. -define void @test5(i32 %k) { -; CHECK-LABEL: @test5( -; CHECK-NEXT: for.body.lr.ph: -; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] -; CHECK: outer.header: -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH:%.*]] ], [ [[J_INC:%.*]], [[OUTER_INC:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_05:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i32 [[J]], 2 -; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -; CHECK: if.then: -; CHECK-NEXT: call void @f1() -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: if.else: -; CHECK-NEXT: call void @f2() -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[INC]] = add nsw i32 [[I_05]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[K:%.*]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[OUTER_INC]] -; CHECK: outer.inc: -; CHECK-NEXT: [[J_INC]] = add nsw i32 [[J]], 1 -; CHECK-NEXT: [[OUTER_CMP:%.*]] = icmp slt i32 [[J_INC]], [[K]] -; CHECK-NEXT: br i1 [[OUTER_CMP]], label [[OUTER_HEADER]], label 
[[FOR_END:%.*]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -for.body.lr.ph: - br label %outer.header - -outer.header: - %j = phi i32 [ 0, %for.body.lr.ph ], [ %j.inc, %outer.inc ] - br label %for.body - -for.body: - %i.05 = phi i32 [ 0, %outer.header ], [ %inc, %for.inc ] - %cmp1 = icmp ult i32 %j, 2 - br i1 %cmp1, label %if.then, label %if.else - -if.then: - call void @f1() - br label %for.inc - -if.else: - call void @f2() - br label %for.inc - -for.inc: - %inc = add nsw i32 %i.05, 1 - %cmp = icmp slt i32 %inc, %k - br i1 %cmp, label %for.body, label %outer.inc - -outer.inc: - %j.inc = add nsw i32 %j, 1 - %outer.cmp = icmp slt i32 %j.inc, %k - br i1 %outer.cmp, label %outer.header, label %for.end - - -for.end: - ret void -} - ; In this test, the condition involves 2 AddRecs. Without evaluating both ; AddRecs, we cannot prove that the condition becomes known in the loop body ; after peeling. -define void @test6(i32 %k) { -; CHECK-LABEL: @test6( +define void @test5(i32 %k) { +; CHECK-LABEL: @test5( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -521,8 +456,8 @@ for.end: ret void } -define void @test7(i32 %k) { -; CHECK-LABEL: @test7( +define void @test6(i32 %k) { +; CHECK-LABEL: @test6( ; CHECK-NEXT: for.body.lr.ph: ; CHECK-NEXT: br label [[FOR_BODY_PEEL_BEGIN:%.*]] ; CHECK: for.body.peel.begin: @@ -615,8 +550,8 @@ for.end: ret void } -define void @test8(i32 %k) { -; CHECK-LABEL: @test8( +define void @test7(i32 %k) { +; CHECK-LABEL: @test7( ; CHECK-NEXT: for.body.lr.ph: ; CHECK-NEXT: br label [[FOR_BODY_PEEL_BEGIN:%.*]] ; CHECK: for.body.peel.begin: @@ -711,8 +646,8 @@ for.end: ; Comparison with non-monotonic predicate due to possible wrapping, loop ; body cannot be simplified. 
-define void @test9(i32 %k) { -; CHECK-LABEL: @test9( +define void @test8(i32 %k) { +; CHECK-LABEL: @test8( ; CHECK-NEXT: for.body.lr.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -751,8 +686,8 @@ for.end: } ; CHECK-NOT: llvm.loop.unroll.disable -define void @test_10__peel_first_iter_via_slt_pred(i32 %len) { -; CHECK-LABEL: @test_10__peel_first_iter_via_slt_pred( +define void @test_9__peel_first_iter_via_slt_pred(i32 %len) { +; CHECK-LABEL: @test_9__peel_first_iter_via_slt_pred( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[LEN:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] @@ -818,8 +753,8 @@ if.end: ; preds = %if.then, %for.body br i1 %exitcond, label %for.cond.cleanup, label %for.body } -define void @test_11__peel_first_iter_via_sgt_pred(i32 %len) { -; CHECK-LABEL: @test_11__peel_first_iter_via_sgt_pred( +define void @test_10__peel_first_iter_via_sgt_pred(i32 %len) { +; CHECK-LABEL: @test_10__peel_first_iter_via_sgt_pred( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[LEN:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] @@ -887,8 +822,8 @@ if.end: ; preds = %if.then, %for.body ; NOTE: here we should only peel the first iteration, ; i.e. all calls to sink() must stay in loop. -define void @test12__peel_first_iter_via_eq_pred(i32 %len) { -; CHECK-LABEL: @test12__peel_first_iter_via_eq_pred( +define void @test11__peel_first_iter_via_eq_pred(i32 %len) { +; CHECK-LABEL: @test11__peel_first_iter_via_eq_pred( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[LEN:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] @@ -956,8 +891,8 @@ if.end: ; preds = %if.then, %for.body ; NOTE: here we should only peel the first iteration, ; i.e. all calls to sink() must stay in loop. 
-define void @test13__peel_first_iter_via_ne_pred(i32 %len) { -; CHECK-LABEL: @test13__peel_first_iter_via_ne_pred( +define void @test12__peel_first_iter_via_ne_pred(i32 %len) { +; CHECK-LABEL: @test12__peel_first_iter_via_ne_pred( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[LEN:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] @@ -1024,8 +959,8 @@ if.end: ; preds = %if.then, %for.body } ; No peeling is profitable here. -define void @test14__ivar_mod2_is_1(i32 %len) { -; CHECK-LABEL: @test14__ivar_mod2_is_1( +define void @test13__ivar_mod2_is_1(i32 %len) { +; CHECK-LABEL: @test13__ivar_mod2_is_1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[LEN:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] @@ -1074,8 +1009,8 @@ if.end: ; preds = %if.then, %for.body } ; No peeling is profitable here. -define void @test15__ivar_mod2_is_0(i32 %len) { -; CHECK-LABEL: @test15__ivar_mod2_is_0( +define void @test14__ivar_mod2_is_0(i32 %len) { +; CHECK-LABEL: @test14__ivar_mod2_is_0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[LEN:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] @@ -1123,10 +1058,10 @@ if.end: ; preds = %if.then, %for.body br i1 %exitcond, label %for.cond.cleanup, label %for.body } -; Similar to @test7, we need to peel one extra iteration, and we can't do that +; Similar to @test6, we need to peel one extra iteration, and we can't do that ; as per the -unroll-peel-max-count=4, so this shouldn't be peeled at all. 
-define void @test16(i32 %k) { -; CHECK-LABEL: @test16( +define void @test15(i32 %k) { +; CHECK-LABEL: @test15( ; CHECK-NEXT: for.body.lr.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -1164,10 +1099,10 @@ for.end: ret void } -; Similar to @test8, we need to peel one extra iteration, and we can't do that +; Similar to @test7, we need to peel one extra iteration, and we can't do that ; as per the -unroll-peel-max-count=4, so this shouldn't be peeled at all. -define void @test17(i32 %k) { -; CHECK-LABEL: @test17( +define void @test16(i32 %k) { +; CHECK-LABEL: @test16( ; CHECK-NEXT: for.body.lr.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-nests.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-nests.ll new file mode 100644 index 00000000000000..dc1d9be860736b --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/peel-loop-nests.ll @@ -0,0 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -loop-unroll -unroll-peel-max-count=4 -verify-dom-info | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-peel-max-count=4 -unroll-allow-loop-nests-peeling -verify-dom-info | FileCheck %s --check-prefix PEELED + +declare void @f1() +declare void @f2() + +; In this case we cannot peel the inner loop, because the condition involves +; the outer induction variable. +; Peel the loop nest if allowed by the flag -unroll-allow-loop-nests-peeling. 
+define void @test1(i32 %k) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: for.body.lr.ph: +; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] +; CHECK: outer.header: +; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH:%.*]] ], [ [[J_INC:%.*]], [[OUTER_INC:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_05:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i32 [[J]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: call void @f1() +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else: +; CHECK-NEXT: call void @f2() +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_05]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[K:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[OUTER_INC]] +; CHECK: outer.inc: +; CHECK-NEXT: [[J_INC]] = add nsw i32 [[J]], 1 +; CHECK-NEXT: [[OUTER_CMP:%.*]] = icmp slt i32 [[J_INC]], [[K]] +; CHECK-NEXT: br i1 [[OUTER_CMP]], label [[OUTER_HEADER]], label [[FOR_END:%.*]], !llvm.loop !{{.*}} +; CHECK: for.end: +; CHECK-NEXT: ret void +; +; PEELED-LABEL: @test1( +; PEELED-NEXT: for.body.lr.ph: +; PEELED-NEXT: br label [[OUTER_HEADER_PEEL_BEGIN:%.*]] +; PEELED: outer.header.peel.begin: +; PEELED-NEXT: br label [[OUTER_HEADER_PEEL:%.*]] +; PEELED: outer.header.peel: +; PEELED-NEXT: br label [[FOR_BODY_PEEL:%.*]] +; PEELED: for.body.peel: +; PEELED-NEXT: [[I_05_PEEL:%.*]] = phi i32 [ 0, [[OUTER_HEADER_PEEL]] ], [ [[INC_PEEL:%.*]], [[FOR_INC_PEEL:%.*]] ] +; PEELED-NEXT: [[CMP1_PEEL:%.*]] = icmp ult i32 0, 2 +; PEELED-NEXT: br i1 [[CMP1_PEEL]], label [[IF_THEN_PEEL:%.*]], label [[IF_ELSE_PEEL:%.*]] +; PEELED: if.else.peel: +; PEELED-NEXT: call void @f2() +; PEELED-NEXT: br label [[FOR_INC_PEEL]] +; PEELED: if.then.peel: +; PEELED-NEXT: call void @f1() +; PEELED-NEXT: br label [[FOR_INC_PEEL]] +; PEELED: for.inc.peel: +; 
PEELED-NEXT: [[INC_PEEL]] = add nsw i32 [[I_05_PEEL]], 1 +; PEELED-NEXT: [[CMP_PEEL:%.*]] = icmp slt i32 [[INC_PEEL]], [[K:%.*]] +; PEELED-NEXT: br i1 [[CMP_PEEL]], label [[FOR_BODY_PEEL]], label [[OUTER_INC_PEEL:%.*]] +; PEELED: outer.inc.peel: +; PEELED-NEXT: [[J_INC_PEEL:%.*]] = add nsw i32 0, 1 +; PEELED-NEXT: [[OUTER_CMP_PEEL:%.*]] = icmp slt i32 [[J_INC_PEEL]], [[K]] +; PEELED-NEXT: br i1 [[OUTER_CMP_PEEL]], label [[OUTER_HEADER_PEEL_NEXT:%.*]], label [[FOR_END:%[^,]*]] +; Verify that MD_loop metadata is dropped. +; PEELED-NOT: , !llvm.loop !{{[0-9]*}} +; PEELED: outer.header.peel.next: +; PEELED-NEXT: br label [[OUTER_HEADER_PEEL2:%.*]] +; PEELED: outer.header.peel2: +; PEELED-NEXT: br label [[FOR_BODY_PEEL3:%.*]] +; PEELED: for.body.peel3: +; PEELED-NEXT: [[I_05_PEEL4:%.*]] = phi i32 [ 0, [[OUTER_HEADER_PEEL2]] ], [ [[INC_PEEL9:%.*]], [[FOR_INC_PEEL8:%.*]] ] +; PEELED-NEXT: [[CMP1_PEEL5:%.*]] = icmp ult i32 [[J_INC_PEEL]], 2 +; PEELED-NEXT: br i1 [[CMP1_PEEL5]], label [[IF_THEN_PEEL7:%.*]], label [[IF_ELSE_PEEL6:%.*]] +; PEELED: if.else.peel6: +; PEELED-NEXT: call void @f2() +; PEELED-NEXT: br label [[FOR_INC_PEEL8]] +; PEELED: if.then.peel7: +; PEELED-NEXT: call void @f1() +; PEELED-NEXT: br label [[FOR_INC_PEEL8]] +; PEELED: for.inc.peel8: +; PEELED-NEXT: [[INC_PEEL9]] = add nsw i32 [[I_05_PEEL4]], 1 +; PEELED-NEXT: [[CMP_PEEL10:%.*]] = icmp slt i32 [[INC_PEEL9]], [[K]] +; PEELED-NEXT: br i1 [[CMP_PEEL10]], label [[FOR_BODY_PEEL3]], label [[OUTER_INC_PEEL11:%.*]] +; PEELED: outer.inc.peel11: +; PEELED-NEXT: [[J_INC_PEEL12:%.*]] = add nsw i32 [[J_INC_PEEL]], 1 +; PEELED-NEXT: [[OUTER_CMP_PEEL13:%.*]] = icmp slt i32 [[J_INC_PEEL12]], [[K]] +; PEELED-NEXT: br i1 [[OUTER_CMP_PEEL13]], label [[OUTER_HEADER_PEEL_NEXT1:%.*]], label [[FOR_END]] +; Verify that MD_loop metadata is dropped. 
+; PEELED-NOT: , !llvm.loop !{{[0-9]*}} +; PEELED: outer.header.peel.next1: +; PEELED-NEXT: br label [[OUTER_HEADER_PEEL_NEXT14:%.*]] +; PEELED: outer.header.peel.next14: +; PEELED-NEXT: br label [[FOR_BODY_LR_PH_PEEL_NEWPH:%.*]] +; PEELED: for.body.lr.ph.peel.newph: +; PEELED-NEXT: br label [[OUTER_HEADER:%.*]] +; PEELED: outer.header: +; PEELED-NEXT: [[J:%.*]] = phi i32 [ [[J_INC_PEEL12]], [[FOR_BODY_LR_PH_PEEL_NEWPH]] ], [ [[J_INC:%.*]], [[OUTER_INC:%.*]] ] +; PEELED-NEXT: br label [[FOR_BODY:%.*]] +; PEELED: for.body: +; PEELED-NEXT: [[I_05:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; PEELED-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; PEELED: if.then: +; PEELED-NEXT: call void @f1() +; PEELED-NEXT: br label [[FOR_INC]] +; PEELED: if.else: +; PEELED-NEXT: call void @f2() +; PEELED-NEXT: br label [[FOR_INC]] +; PEELED: for.inc: +; PEELED-NEXT: [[INC]] = add nsw i32 [[I_05]], 1 +; PEELED-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[K]] +; PEELED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[OUTER_INC]] +; PEELED: outer.inc: +; PEELED-NEXT: [[J_INC]] = add nuw nsw i32 [[J]], 1 +; PEELED-NEXT: [[OUTER_CMP:%.*]] = icmp slt i32 [[J_INC]], [[K]] +; PEELED-NEXT: br i1 [[OUTER_CMP]], label [[OUTER_HEADER]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop !{{.*}} +; PEELED: for.end.loopexit: +; PEELED-NEXT: br label [[FOR_END]] +; PEELED: for.end: +; PEELED-NEXT: ret void +; +for.body.lr.ph: + br label %outer.header + +outer.header: + %j = phi i32 [ 0, %for.body.lr.ph ], [ %j.inc, %outer.inc ] + br label %for.body + +for.body: + %i.05 = phi i32 [ 0, %outer.header ], [ %inc, %for.inc ] + %cmp1 = icmp ult i32 %j, 2 + br i1 %cmp1, label %if.then, label %if.else + +if.then: + call void @f1() + br label %for.inc + +if.else: + call void @f2() + br label %for.inc + +for.inc: + %inc = add nsw i32 %i.05, 1 + %cmp = icmp slt i32 %inc, %k + br i1 %cmp, label %for.body, label %outer.inc + +outer.inc: + %j.inc = add nsw i32 %j, 1 + 
%outer.cmp = icmp slt i32 %j.inc, %k + br i1 %outer.cmp, label %outer.header, label %for.end, !llvm.loop !0 + +for.end: + ret void +} + +!0 = distinct !{!0} diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll new file mode 100644 index 00000000000000..0886b8eca2ef3d --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -0,0 +1,1358 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { +; CHECK-LABEL: @reduction_sum( +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]] +; CHECK: .lr.ph.preheader: +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP4]], 8589934588 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = 
getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]] +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP11]] = add <4 x i32> [[TMP10]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ] +; CHECK-NEXT: br label [[DOTLR_PH:%.*]] +; CHECK: .lr.ph: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[TMP21:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: 
[[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[SUM_02]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], [[TMP15]] +; CHECK-NEXT: [[TMP21]] = add i32 [[TMP20]], [[TMP17]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop !2 +; CHECK: ._crit_edge.loopexit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] +; CHECK: ._crit_edge: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] +; + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %3 = load i32, i32* %2, align 4 + %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %5 = load i32, i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = add i32 %sum.02, %6 + %8 = add i32 %7, %3 + %9 = add i32 %8, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { +; CHECK-LABEL: @reduction_prod( +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; 
CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]] +; CHECK: .lr.ph.preheader: +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP4]], 8589934588 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[VEC_PHI]], [[VEC_IND2]] +; CHECK-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP9]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP11]] = mul <4 x i32> [[TMP10]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP11]]) +; CHECK-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 1, [[DOTLR_PH_PREHEADER]] ] +; CHECK-NEXT: br label [[DOTLR_PH:%.*]] +; CHECK: .lr.ph: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[PROD_02:%.*]] = phi i32 [ [[TMP21:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[PROD_02]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[TMP15]] +; CHECK-NEXT: [[TMP21]] = mul i32 [[TMP20]], [[TMP17]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop !5 +; CHECK: ._crit_edge.loopexit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] +; CHECK: ._crit_edge: +; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ 1, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[PROD_0_LCSSA]] +; + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + 
%indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ] + %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %3 = load i32, i32* %2, align 4 + %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %5 = load i32, i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = mul i32 %prod.02, %6 + %8 = mul i32 %7, %3 + %9 = mul i32 %8, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ] + ret i32 %prod.0.lcssa +} + +define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { +; CHECK-LABEL: @reduction_mix( +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]] +; CHECK: .lr.ph.preheader: +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP4]], 8589934588 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 
x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]] +; CHECK-NEXT: [[TMP11]] = add <4 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ] +; CHECK-NEXT: br label [[DOTLR_PH:%.*]] +; CHECK: .lr.ph: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[TMP21:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = mul nsw i32 [[TMP17]], 
[[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[SUM_02]], [[TMP19]] +; CHECK-NEXT: [[TMP21]] = add i32 [[TMP20]], [[TMP18]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop !7 +; CHECK: ._crit_edge.loopexit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] +; CHECK: ._crit_edge: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] +; + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %3 = load i32, i32* %2, align 4 + %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %5 = load i32, i32* %4, align 4 + %6 = mul nsw i32 %5, %3 + %7 = trunc i64 %indvars.iv to i32 + %8 = add i32 %sum.02, %7 + %9 = add i32 %8, %6 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { +; CHECK-LABEL: @reduction_mul( +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]] +; CHECK: 
.lr.ph.preheader: +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP4]], 8589934588 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP10]] = mul <4 x i32> [[TMP9]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP10]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], 
[[MIDDLE_BLOCK]] ], [ 19, [[DOTLR_PH_PREHEADER]] ] +; CHECK-NEXT: br label [[DOTLR_PH:%.*]] +; CHECK: .lr.ph: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[TMP18:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[SUM_02]], [[TMP14]] +; CHECK-NEXT: [[TMP18]] = mul i32 [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop !9 +; CHECK: ._crit_edge.loopexit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[DOTLR_PH]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] +; CHECK: ._crit_edge: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] +; + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %7, %.lr.ph ], [ 19, %0 ] + %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %3 = load i32, i32* %2, align 4 + %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %5 = load i32, i32* %4, align 4 + %6 = mul i32 %sum.02, %3 + %7 = mul i32 %6, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = 
icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %7, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp { +; CHECK-LABEL: @start_at_non_zero( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[COEFF:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8]] = add <4 x i32> [[TMP7]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 
[[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 120, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SUM_09:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[COEFF]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[SUM_09]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !11 +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 120, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] +; +entry: + %cmp7 = icmp sgt 
i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %in, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %coeff, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, %sum.09 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ] + ret i32 %sum.0.lcssa +} + +define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { +; CHECK-LABEL: @reduction_and( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: 
[[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8]] = and <4 x i32> [[TMP7]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ -1, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = and i32 [[RESULT_08]], [[TMP11]] +; CHECK-NEXT: [[AND]] = and i32 [[ADD]], [[TMP12]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] 
= trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !13 +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[AND_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = and i32 %result.08, %0 + %and = and i32 %add, %1 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ] + ret i32 %result.0.lcssa +} + +define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { +; CHECK-LABEL: @reduction_or( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], 
label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8]] = or <4 x i32> [[TMP7]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[OR:%.*]], 
[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[OR]] = or i32 [[ADD]], [[RESULT_08]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !15 +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OR_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %or = or i32 %add, %result.08 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ] + ret i32 %result.0.lcssa +} + +define i32 
@reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { +; CHECK-LABEL: @reduction_xor( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8]] = xor <4 x i32> [[TMP7]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 
[[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[XOR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[XOR]] = xor i32 [[ADD]], [[RESULT_08]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !17 +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[XOR_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ] + %arrayidx = 
getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %xor = xor i32 %add, %result.08 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ] + ret i32 %result.0.lcssa +} + +define float @reduction_fadd(i32 %n, float* nocapture %A, float* nocapture %B) nounwind uwtable readonly { +; CHECK-LABEL: @reduction_fadd( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* 
[[TMP5]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8]] = fadd fast <4 x float> [[TMP7]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !18 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[RESULT_08:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[RESULT_08]], [[TMP11]] +; CHECK-NEXT: [[FADD]] = fadd fast float [[ADD]], [[TMP12]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 
[[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !19 +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[FADD_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[FADD_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi float [ %fadd, %for.body ], [ 0.0, %entry ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv + %1 = load float, float* %arrayidx2, align 4 + %add = fadd fast float %result.08, %0 + %fadd = fadd fast float %add, %1 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi float [ 0.0, %entry ], [ %fadd, %for.body ] + ret float %result.0.lcssa +} + +define float @reduction_fmul(i32 %n, float* nocapture %A, float* nocapture %B) nounwind uwtable readonly { +; CHECK-LABEL: @reduction_fmul( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label 
[[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8]] = fmul fast <4 x float> [[TMP7]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !20 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], 
[[SCALAR_PH]] ] +; CHECK-NEXT: [[RESULT_08:%.*]] = phi float [ [[FMUL:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fmul fast float [[RESULT_08]], [[TMP11]] +; CHECK-NEXT: [[FMUL]] = fmul fast float [[ADD]], [[TMP12]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !21 +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[FMUL_LCSSA:%.*]] = phi float [ [[FMUL]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[FMUL_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi float [ %fmul, %for.body ], [ 0.0, %entry ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv + %1 = load float, float* %arrayidx2, align 4 + %add = fmul fast float %result.08, %0 + %fmul = fmul fast float %add, %1 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label 
%for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi float [ 0.0, %entry ], [ %fmul, %for.body ] + ret float %result.0.lcssa +} + +define i32 @reduction_min(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { +; CHECK-LABEL: @reduction_min( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !22 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br 
i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 1000, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[V0:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[RESULT_08]], [[TMP9]] +; CHECK-NEXT: [[V0]] = select i1 [[C0]], i32 [[RESULT_08]], i32 [[TMP9]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !23 +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[V0_LCSSA:%.*]] = phi i32 [ [[V0]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[V0_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %c0 = icmp slt i32 %result.08, %0 + %v0 = select i1 %c0, i32 %result.08, i32 %0 + 
%indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %v0, %for.body ] + ret i32 %result.0.lcssa +} + +define i32 @reduction_max(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { +; CHECK-LABEL: @reduction_max( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !24 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP8:%.*]] = 
call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 1000, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[V0:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[C0:%.*]] = icmp ugt i32 [[RESULT_08]], [[TMP9]] +; CHECK-NEXT: [[V0]] = select i1 [[C0]], i32 [[RESULT_08]], i32 [[TMP9]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !25 +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[V0_LCSSA:%.*]] = phi i32 [ [[V0]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[V0_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ] + %arrayidx = getelementptr inbounds i32, i32* 
%A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %c0 = icmp ugt i32 %result.08, %0 + %v0 = select i1 %c0, i32 %result.08, i32 %0 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %v0, %for.body ] + ret i32 %result.0.lcssa +} + +; Sub we can create a reduction, but not inloop +define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly { +; CHECK-LABEL: @reduction_sub_lhs( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !26 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[X_05:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[SUB]] = sub nsw i32 [[X_05]], [[TMP8]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !27 +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[X_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUB_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[X_0_LCSSA]] +; +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds 
i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %sub = sub nsw i32 %x.05, %0 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ] + ret i32 %x.0.lcssa +} + +; Conditional reductions with multi-input phis. +define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) { +; CHECK-LABEL: @reduction_conditional( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[S:%.*]], i32 0 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ [[TMP0]], [[VECTOR_PH]] ], [ [[PREDPHI3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[TMP1]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], +; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], +; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i1> 
[[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]] +; CHECK-NEXT: [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]] +; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[PREDPHI3]] = select <4 x i1> [[TMP13]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !28 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]]) +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br i1 undef, label [[IF_THEN8:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then8: +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else: +; CHECK-NEXT: br i1 undef, label [[IF_THEN16:%.*]], label [[FOR_INC]] +; CHECK: if.then16: +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: br i1 undef, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !29 +; CHECK: for.end: +; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi float [ undef, [[FOR_INC]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[SUM_1_LCSSA]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] + %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv + %1 = load float, float* 
%arrayidx2, align 4 + %cmp3 = fcmp ogt float %0, %1 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: + %cmp6 = fcmp ogt float %1, 1.000000e+00 + br i1 %cmp6, label %if.then8, label %if.else + +if.then8: + %add = fadd fast float %sum.033, %0 + br label %for.inc + +if.else: + %cmp14 = fcmp ogt float %0, 2.000000e+00 + br i1 %cmp14, label %if.then16, label %for.inc + +if.then16: + %add19 = fadd fast float %sum.033, %1 + br label %for.inc + +for.inc: + %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ %sum.033, %if.else ], [ %sum.033, %for.body ] + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, 128 + br i1 %exitcond, label %for.body, label %for.end + +for.end: + %sum.1.lcssa = phi float [ %sum.1, %for.inc ] + ret float %sum.1.lcssa +} + +define i32 @reduction_sum_multiuse(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) { +; CHECK-LABEL: @reduction_sum_multiuse( +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[END:%.*]] +; CHECK: .lr.ph.preheader: +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP4]], 8589934588 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = 
getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]] +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP11]] = add <4 x i32> [[TMP10]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !30 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ] +; CHECK-NEXT: br label [[DOTLR_PH:%.*]] +; CHECK: .lr.ph: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[TMP21:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: 
[[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[SUM_02]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], [[TMP15]] +; CHECK-NEXT: [[TMP21]] = add i32 [[TMP20]], [[TMP17]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !31 +; CHECK: ._crit_edge: +; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_COPY:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[F1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[SUM_LCSSA]], [[DOT_CRIT_EDGE]] ] +; CHECK-NEXT: [[F2:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[SUM_COPY]], [[DOT_CRIT_EDGE]] ] +; CHECK-NEXT: [[FINAL:%.*]] = add i32 [[F1]], [[F2]] +; CHECK-NEXT: ret i32 [[FINAL]] +; + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph.preheader, label %end +.lr.ph.preheader: ; preds = %0 + br label %.lr.ph + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %3 = load i32, i32* %2, align 4 + %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %5 = load i32, i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = add i32 %sum.02, %6 + %8 = add i32 %7, %3 + %9 = add i32 %8, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.lcssa = phi i32 [ %9, 
%.lr.ph ] + %sum.copy = phi i32 [ %9, %.lr.ph ] + br label %end + +end: + %f1 = phi i32 [ 0, %0 ], [ %sum.lcssa, %._crit_edge ] + %f2 = phi i32 [ 0, %0 ], [ %sum.copy, %._crit_edge ] + %final = add i32 %f1, %f2 + ret i32 %final +} + +; Predicated loop, cannot (yet) use in-loop reductions. +define i32 @reduction_predicated(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { +; CHECK-LABEL: @reduction_predicated( +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]] +; CHECK: .lr.ph.preheader: +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[TMP3]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[PRED_LOAD_CONTINUE14]] ] +; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[PRED_LOAD_CONTINUE14]] ] +; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: 
[[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> undef, i32 [[TMP10]], i32 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x i32> [ undef, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK: pred.load.if1: +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP15]], i32 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.continue2: +; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2 +; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK: pred.load.if3: +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP20]], i32 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK: pred.load.continue4: +; CHECK-NEXT: [[TMP22:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP21]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 +; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_LOAD_IF5:%.*]], label 
[[PRED_LOAD_CONTINUE6:%.*]] +; CHECK: pred.load.if5: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP25]], i32 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.continue6: +; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i32> [ [[TMP22]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP26]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; CHECK: pred.load.if7: +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> undef, i32 [[TMP30]], i32 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; CHECK: pred.load.continue8: +; CHECK-NEXT: [[TMP32:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP31]], [[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP33]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; CHECK: pred.load.if9: +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP34]], align 4 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP35]], i32 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; CHECK: pred.load.continue10: +; CHECK-NEXT: [[TMP37:%.*]] = phi <4 x i32> [ [[TMP32]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP36]], [[PRED_LOAD_IF9]] ] +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2 +; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; CHECK: pred.load.if11: +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32* [[B]], 
i64 [[TMP5]] +; CHECK-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP40]], i32 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; CHECK: pred.load.continue12: +; CHECK-NEXT: [[TMP42:%.*]] = phi <4 x i32> [ [[TMP37]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP41]], [[PRED_LOAD_IF11]] ] +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 +; CHECK-NEXT: br i1 [[TMP43]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]] +; CHECK: pred.load.if13: +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP45]], i32 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; CHECK: pred.load.continue14: +; CHECK-NEXT: [[TMP47:%.*]] = phi <4 x i32> [ [[TMP42]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP46]], [[PRED_LOAD_IF13]] ] +; CHECK-NEXT: [[TMP48:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND15]] +; CHECK-NEXT: [[TMP49:%.*]] = add <4 x i32> [[TMP48]], [[TMP27]] +; CHECK-NEXT: [[TMP50]] = add <4 x i32> [[TMP49]], [[TMP47]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], +; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !32 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP52:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[TMP50]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP53:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP52]]) +; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[DOTLR_PH:%.*]] +; CHECK: .lr.ph: +; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE_LOOPEXIT]], label 
[[DOTLR_PH]], !llvm.loop !33 +; CHECK: ._crit_edge.loopexit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] +; CHECK: ._crit_edge: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] +; + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %3 = load i32, i32* %2, align 4 + %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %5 = load i32, i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = add i32 %sum.02, %6 + %8 = add i32 %7, %3 + %9 = add i32 %8, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph, !llvm.loop !6 + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +!6 = distinct !{!6, !7, !8} +!7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!8 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/Util/dbg-call-bitcast.ll b/llvm/test/Transforms/Util/dbg-call-bitcast.ll index 6625b469b06e4a..2d602c13635e22 100644 --- a/llvm/test/Transforms/Util/dbg-call-bitcast.ll +++ b/llvm/test/Transforms/Util/dbg-call-bitcast.ll @@ -10,6 +10,26 @@ define dso_local void @_Z1fv() { ; CHECK: call void @_Z1gPv call void @_Z1gPv(i8* nonnull %2) %3 = bitcast i32* %1 to i8* +; CHECK-NOT: call void @llvm.dbg.value +; CHECK: call void @_Z1gPv + call void @_Z1gPv(i8* nonnull %3) + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) + ret void, !dbg !21 +} + +define dso_local void @_Z2fv() { + %1 = alloca i32, align 4 + %2 
= bitcast i32* %1 to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) + call void @llvm.dbg.declare(metadata i32* %1, metadata !16, metadata !DIExpression()), !dbg !19 +; CHECK: %[[A:.*]] = alloca i32, align 4 +; CHECK: call void @llvm.dbg.value(metadata i32* %[[A]], {{.*}}, metadata !DIExpression(DW_OP_deref) +; CHECK: call void @_Z1gPv + call void @_Z1gPv(i8* nonnull %2) + br label %block2 + +block2: + %3 = bitcast i32* %1 to i8* ; CHECK: call void @llvm.dbg.value(metadata i32* %[[A]], {{.*}}, metadata !DIExpression(DW_OP_deref) ; CHECK: call void @_Z1gPv call void @_Z1gPv(i8* nonnull %3) diff --git a/llvm/test/Transforms/WholeProgramDevirt/branch-funnel.ll b/llvm/test/Transforms/WholeProgramDevirt/branch-funnel.ll index bf7c8547f27102..32d964819fee5c 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/branch-funnel.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/branch-funnel.ll @@ -10,7 +10,7 @@ ; SUMMARY: TypeIdMap: ; SUMMARY-NEXT: typeid3: ; SUMMARY-NEXT: TTRes: -; SUMMARY-NEXT: Kind: Unknown +; SUMMARY-NEXT: Kind: Unsat ; SUMMARY-NEXT: SizeM1BitWidth: 0 ; SUMMARY-NEXT: AlignLog2: 0 ; SUMMARY-NEXT: SizeM1: 0 @@ -23,7 +23,7 @@ ; SUMMARY-NEXT: ResByArg: ; SUMMARY-NEXT: typeid1: ; SUMMARY-NEXT: TTRes: -; SUMMARY-NEXT: Kind: Unknown +; SUMMARY-NEXT: Kind: Unsat ; SUMMARY-NEXT: SizeM1BitWidth: 0 ; SUMMARY-NEXT: AlignLog2: 0 ; SUMMARY-NEXT: SizeM1: 0 @@ -36,7 +36,7 @@ ; SUMMARY-NEXT: ResByArg: ; SUMMARY-NEXT: typeid2: ; SUMMARY-NEXT: TTRes: -; SUMMARY-NEXT: Kind: Unknown +; SUMMARY-NEXT: Kind: Unsat ; SUMMARY-NEXT: SizeM1BitWidth: 0 ; SUMMARY-NEXT: AlignLog2: 0 ; SUMMARY-NEXT: SizeM1: 0 diff --git a/llvm/test/Transforms/WholeProgramDevirt/devirt-single-impl2.ll b/llvm/test/Transforms/WholeProgramDevirt/devirt-single-impl2.ll index 7c85114239cf27..63ccfb833d4560 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/devirt-single-impl2.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/devirt-single-impl2.ll @@ -14,7 +14,7 @@ ; RUN: 
-wholeprogramdevirt-summary-action=export -o /dev/null 2>&1 | FileCheck %s --check-prefix=MISSING-MODULE ; Check single impl devirtulation in summary -; CHECK: typeid: (name: "_ZTS1A", summary: (typeTestRes: (kind: unknown, sizeM1BitWidth: 0), wpdResolutions: ((offset: 0, wpdRes: (kind: singleImpl, singleImplName: "_ZNK1A1fEv"))))) ; guid +; CHECK: typeid: (name: "_ZTS1A", summary: (typeTestRes: (kind: unsat, sizeM1BitWidth: 0), wpdResolutions: ((offset: 0, wpdRes: (kind: singleImpl, singleImplName: "_ZNK1A1fEv"))))) ; guid ; MISSING-MODULE: combined summary should contain Regular LTO module diff --git a/llvm/test/Transforms/WholeProgramDevirt/export-single-impl.ll b/llvm/test/Transforms/WholeProgramDevirt/export-single-impl.ll index 861f5f6584898a..33ff9e1afe50f6 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/export-single-impl.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/export-single-impl.ll @@ -4,7 +4,7 @@ ; SUMMARY: TypeIdMap: ; SUMMARY-NEXT: typeid3: ; SUMMARY-NEXT: TTRes: -; SUMMARY-NEXT: Kind: Unknown +; SUMMARY-NEXT: Kind: Unsat ; SUMMARY-NEXT: SizeM1BitWidth: 0 ; SUMMARY-NEXT: AlignLog2: 0 ; SUMMARY-NEXT: SizeM1: 0 @@ -17,7 +17,7 @@ ; SUMMARY-NEXT: ResByArg: ; SUMMARY-NEXT: typeid1: ; SUMMARY-NEXT: TTRes: -; SUMMARY-NEXT: Kind: Unknown +; SUMMARY-NEXT: Kind: Unsat ; SUMMARY-NEXT: SizeM1BitWidth: 0 ; SUMMARY-NEXT: AlignLog2: 0 ; SUMMARY-NEXT: SizeM1: 0 @@ -30,7 +30,7 @@ ; SUMMARY-NEXT: ResByArg: ; SUMMARY-NEXT: typeid2: ; SUMMARY-NEXT: TTRes: -; SUMMARY-NEXT: Kind: Unknown +; SUMMARY-NEXT: Kind: Unsat ; SUMMARY-NEXT: SizeM1BitWidth: 0 ; SUMMARY-NEXT: AlignLog2: 0 ; SUMMARY-NEXT: SizeM1: 0 @@ -43,7 +43,7 @@ ; SUMMARY-NEXT: ResByArg: ; SUMMARY-NEXT: typeid4: ; SUMMARY-NEXT: TTRes: -; SUMMARY-NEXT: Kind: Unknown +; SUMMARY-NEXT: Kind: Unsat ; SUMMARY-NEXT: SizeM1BitWidth: 0 ; SUMMARY-NEXT: AlignLog2: 0 ; SUMMARY-NEXT: SizeM1: 0 diff --git a/llvm/test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll 
b/llvm/test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll index 634eaa12196eb0..cb2fddd75d1d0e 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll @@ -6,7 +6,7 @@ ; SUMMARY: TypeIdMap: ; SUMMARY-NEXT: typeid4: ; SUMMARY-NEXT: TTRes: -; SUMMARY-NEXT: Kind: Unknown +; SUMMARY-NEXT: Kind: Unsat ; SUMMARY-NEXT: SizeM1BitWidth: 0 ; SUMMARY-NEXT: AlignLog2: 0 ; SUMMARY-NEXT: SizeM1: 0 diff --git a/llvm/test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll b/llvm/test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll index 7b646341ece278..0f780a3873687c 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll @@ -6,7 +6,7 @@ ; SUMMARY: TypeIdMap: ; SUMMARY-NEXT: typeid3: ; SUMMARY-NEXT: TTRes: -; SUMMARY-NEXT: Kind: Unknown +; SUMMARY-NEXT: Kind: Unsat ; SUMMARY-NEXT: SizeM1BitWidth: 0 ; SUMMARY-NEXT: AlignLog2: 0 ; SUMMARY-NEXT: SizeM1: 0 @@ -24,7 +24,7 @@ ; SUMMARY-NEXT: Bit: 0 ; SUMMARY-NEXT: typeid4: ; SUMMARY-NEXT: TTRes: -; SUMMARY-NEXT: Kind: Unknown +; SUMMARY-NEXT: Kind: Unsat ; SUMMARY-NEXT: SizeM1BitWidth: 0 ; SUMMARY-NEXT: AlignLog2: 0 ; SUMMARY-NEXT: SizeM1: 0 diff --git a/llvm/test/Transforms/WholeProgramDevirt/export-vcp.ll b/llvm/test/Transforms/WholeProgramDevirt/export-vcp.ll index e33abd259625a8..eb7b36e87dd62b 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/export-vcp.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/export-vcp.ll @@ -9,7 +9,7 @@ target datalayout = "e-p:64:64" ; SUMMARY: TypeIdMap: ; SUMMARY-NEXT: typeid3: ; SUMMARY-NEXT: TTRes: -; SUMMARY-NEXT: Kind: Unknown +; SUMMARY-NEXT: Kind: Unsat ; SUMMARY-NEXT: SizeM1BitWidth: 0 ; SUMMARY-NEXT: AlignLog2: 0 ; SUMMARY-NEXT: SizeM1: 0 @@ -29,7 +29,7 @@ target datalayout = "e-p:64:64" ; SUMMARY-ARM-NEXT: Bit: 1 ; SUMMARY-NEXT: typeid4: ; SUMMARY-NEXT: TTRes: -; SUMMARY-NEXT: Kind: 
Unknown +; SUMMARY-NEXT: Kind: Unsat ; SUMMARY-NEXT: SizeM1BitWidth: 0 ; SUMMARY-NEXT: AlignLog2: 0 ; SUMMARY-NEXT: SizeM1: 0 diff --git a/llvm/test/Transforms/WholeProgramDevirt/import-indir.ll b/llvm/test/Transforms/WholeProgramDevirt/import-indir.ll index 19ee68be955a0f..5c2be7d8629631 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/import-indir.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/import-indir.ll @@ -32,7 +32,7 @@ ; SUMMARY-NEXT: TypeIdMap: ; SUMMARY-NEXT: typeid1: ; SUMMARY-NEXT: TTRes: -; SUMMARY-NEXT: Kind: Unknown +; SUMMARY-NEXT: Kind: Unsat ; SUMMARY-NEXT: SizeM1BitWidth: 0 ; SUMMARY-NEXT: AlignLog2: 0 ; SUMMARY-NEXT: SizeM1: 0 diff --git a/llvm/test/Transforms/WholeProgramDevirt/uniform-retval.ll b/llvm/test/Transforms/WholeProgramDevirt/uniform-retval.ll index 16f9ef822d6f3c..7626aba24c1ab5 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/uniform-retval.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/uniform-retval.ll @@ -25,7 +25,7 @@ define i32 @call(i8* %obj) { %fptr = load i8*, i8** %fptrptr %fptr_casted = bitcast i8* %fptr to i32 (i8*)* %result = call i32 %fptr_casted(i8* %obj) - ; CHECK-NOT: call i32 % + ; CHECK-NOT: call ; CHECK: ret i32 123 ret i32 %result } diff --git a/llvm/test/Verifier/disubprogram-name-match-only.ll b/llvm/test/Verifier/disubprogram-name-match-only.ll new file mode 100644 index 00000000000000..ae23ae201d55a5 --- /dev/null +++ b/llvm/test/Verifier/disubprogram-name-match-only.ll @@ -0,0 +1,26 @@ +; RUN: llvm-as -disable-output <%s 2>&1| FileCheck %s + +define void @f() !dbg !14 { + ret void, !dbg !5 +} + +!llvm.module.flags = !{!15} +!llvm.dbg.cu = !{!4} + +!0 = !{null} +!1 = distinct !DICompositeType(tag: DW_TAG_structure_type) +!2 = !DIFile(filename: "f.c", directory: "/") +!3 = !DISubroutineType(types: !0) +!4 = distinct !DICompileUnit(language: DW_LANG_C, file: !2) +; CHECK: !dbg attachment points at wrong subprogram for function +; CHECK: warning: ignoring invalid debug info +!5 = !DILocation(line: 
1, scope: !9) +!9 = distinct !DISubprogram(name: "f", scope: !1, + file: !2, line: 1, type: !3, isLocal: true, + isDefinition: true, scopeLine: 2, + unit: !4) +!14 = distinct !DISubprogram(name: "f", scope: !1, + file: !2, line: 1, type: !3, isLocal: true, + isDefinition: true, scopeLine: 2, + unit: !4) +!15 = !{i32 1, !"Debug Info Version", i32 3} diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in index b872a8a0a6edeb..6f4d5f79082813 100644 --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -33,7 +33,7 @@ config.host_cxx = "@HOST_CXX@" config.host_ldflags = '@HOST_LDFLAGS@' config.llvm_use_intel_jitevents = @LLVM_USE_INTEL_JITEVENTS@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" -config.have_zlib = @LLVM_ENABLE_ZLIB@ +config.have_zlib = @HAVE_LIBZ@ config.have_libxar = @HAVE_LIBXAR@ config.have_dia_sdk = @LLVM_ENABLE_DIA_SDK@ config.enable_ffi = @LLVM_ENABLE_FFI@ diff --git a/llvm/test/tools/llvm-ar/response-utf8.test b/llvm/test/tools/llvm-ar/response-utf8.test new file mode 100644 index 00000000000000..b3e405f854902c --- /dev/null +++ b/llvm/test/tools/llvm-ar/response-utf8.test @@ -0,0 +1,11 @@ +## Check that response files can cope with non-ascii characters. + +# RUN: echo 'contents' > %t-£.txt + +# RUN: rm -f %t-£.a +# RUN: echo 'r %t-£.a %t-£.txt' > %t-replace.txt +# RUN: llvm-ar @%t-replace.txt + +# RUN: echo 'p %t-£.a %t-£.txt' > %t-print.txt +# RUN: llvm-ar @%t-print.txt | FileCheck %s +# CHECK: contents diff --git a/llvm/test/tools/llvm-ar/response.test b/llvm/test/tools/llvm-ar/response.test new file mode 100644 index 00000000000000..a08a63e88182b7 --- /dev/null +++ b/llvm/test/tools/llvm-ar/response.test @@ -0,0 +1,34 @@ +## llvm-ar should be able to consume response files. + +# RUN: echo 'contents' > %t.txt +# RUN: echo 'rc %t1.a %t.txt' > %t.response1.txt +# RUN: llvm-ar @%t.response1.txt +# RUN: llvm-ar p %t1.a | FileCheck %s --check-prefix=CONTENTS + +## Quotes and Spaces. 
+# RUN: echo 'contents' > '%t space.txt' +## Python is used here to ensure the quotes are written to the response file +# RUN: %python -c "import os; open(r'%t.response2.txt', 'w').write(r'%t2.a \"%t space.txt\"'+ '\n')" +# RUN: llvm-ar rc @%t.response2.txt +# RUN: llvm-ar p %t2.a | FileCheck %s --check-prefix=CONTENTS + +## Arguments after the response file. +# RUN: echo 'rc %t3.a' > %t.response3.txt +# RUN: llvm-ar @%t.response3.txt %t.txt +# RUN: llvm-ar p %t3.a | FileCheck %s --check-prefix=CONTENTS + +# CONTENTS: contents + +## rsp-quoting +# RUN: not llvm-ar --rsp-quoting=foobar @%t.response1.txt 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERROR +# ERROR: Invalid response file quoting style foobar + +# RUN: echo -e 'rc %/t.a blah\\foo' > %t-rsp.txt +# RUN: not llvm-ar --rsp-quoting=windows @%t-rsp.txt 2>&1 | \ +# RUN: FileCheck %s --check-prefix=WIN +# WIN: error: blah\foo: {{[Nn]}}o such file or directory + +# RUN: not llvm-ar -rsp-quoting posix @%t-rsp.txt 2>&1 | \ +# RUN: FileCheck %s --check-prefix=POSIX +# POSIX: error: blahfoo: {{[Nn]}}o such file or directory diff --git a/llvm/test/tools/llvm-gsymutil/fat-macho-dwarf.yaml b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/fat-macho-dwarf.yaml similarity index 100% rename from llvm/test/tools/llvm-gsymutil/fat-macho-dwarf.yaml rename to llvm/test/tools/llvm-gsymutil/ARM_AArch64/fat-macho-dwarf.yaml diff --git a/llvm/test/tools/llvm-gsymutil/ARM_AArch64/lit.local.cfg b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/lit.local.cfg new file mode 100644 index 00000000000000..e06c15ef14138a --- /dev/null +++ b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/lit.local.cfg @@ -0,0 +1,4 @@ +if not ('ARM' in config.root.targets and 'AArch64' in config.root.targets): + config.unsupported = True + +config.suffixes = ['.test', '.yaml'] diff --git a/llvm/test/tools/llvm-gsymutil/elf-dwarf.yaml b/llvm/test/tools/llvm-gsymutil/X86/elf-dwarf.yaml similarity index 100% rename from llvm/test/tools/llvm-gsymutil/elf-dwarf.yaml rename to 
llvm/test/tools/llvm-gsymutil/X86/elf-dwarf.yaml diff --git a/llvm/test/tools/llvm-gsymutil/X86/lit.local.cfg b/llvm/test/tools/llvm-gsymutil/X86/lit.local.cfg new file mode 100644 index 00000000000000..52c762f5cfb8b1 --- /dev/null +++ b/llvm/test/tools/llvm-gsymutil/X86/lit.local.cfg @@ -0,0 +1,4 @@ +if not 'X86' in config.root.targets: + config.unsupported = True + +config.suffixes = ['.test', '.yaml'] diff --git a/llvm/test/tools/llvm-gsymutil/mach-dwarf.yaml b/llvm/test/tools/llvm-gsymutil/X86/mach-dwarf.yaml similarity index 100% rename from llvm/test/tools/llvm-gsymutil/mach-dwarf.yaml rename to llvm/test/tools/llvm-gsymutil/X86/mach-dwarf.yaml diff --git a/llvm/test/tools/obj2yaml/duplicate-symbol-and-section-names.test b/llvm/test/tools/obj2yaml/duplicate-symbol-and-section-names.test index 9dc198392c9b91..0eba412ffa471e 100644 --- a/llvm/test/tools/obj2yaml/duplicate-symbol-and-section-names.test +++ b/llvm/test/tools/obj2yaml/duplicate-symbol-and-section-names.test @@ -125,13 +125,7 @@ Symbols: # RUN: yaml2obj --docnum=3 %s -o %t3 # RUN: obj2yaml %t3 | FileCheck %s --check-prefix=CASE3 -# CASE3: --- !ELF -# CASE3-NEXT: FileHeader: -# CASE3-NEXT: Class: ELFCLASS64 -# CASE3-NEXT: Data: ELFDATA2LSB -# CASE3-NEXT: Type: ET_DYN -# CASE3-NEXT: Machine: EM_X86_64 -# CASE3-NEXT: Symbols: +# CASE3: Symbols: # CASE3-NEXT: - Name: foo # CASE3-NEXT: Binding: STB_GLOBAL # CASE3-NEXT: DynamicSymbols: diff --git a/llvm/test/tools/obj2yaml/implicit-sections-order.yaml b/llvm/test/tools/obj2yaml/implicit-sections-order.yaml new file mode 100644 index 00000000000000..555b1f3edc0423 --- /dev/null +++ b/llvm/test/tools/obj2yaml/implicit-sections-order.yaml @@ -0,0 +1,163 @@ +## Check that obj2yaml dumps SHT_STRTAB/SHT_SYMTAB/SHT_DYNSYM sections +## when they are allocatable. 
+ +## In the following test we check the normal case: when .dynsym (SHT_DYNSYM) +## and .dynstr (SHT_STRTAB) are allocatable sections and .symtab (SHT_SYMTAB), +## .strtab (SHT_STRTAB) and .shstrtab (SHT_STRTAB) are not. +## Check we explicitly declare allocatable sections. + +# RUN: yaml2obj %s -o %t1.so -D FLAG1=SHF_ALLOC -D FLAG2="" +# RUN: llvm-readelf -S %t1.so | FileCheck %s --check-prefixes=RE,RE-1 +# RUN: obj2yaml %t1.so | FileCheck %s --check-prefix=OUTPUT + +## Check the information about sections using an independent tool. + +# RE: Section Headers: +# RE-NEXT: [Nr] Name Type Address Off Size ES Flg Lk Inf Al +# RE-NEXT: [ 0] NULL 0000000000000000 000000 000000 00 0 0 0 +# RE-NEXT: [ 1] .foo.1 PROGBITS 0000000000000000 000040 000000 00 0 0 0 +# RE-1-NEXT: [ 2] .dynsym DYNSYM 0000000000001000 000040 000030 18 A 4 2 0 +# RE-2-NEXT: [ 2] .dynsym DYNSYM 0000000000001000 000040 000030 18 4 2 0 +# RE-NEXT: [ 3] .foo.2 PROGBITS 0000000000000000 000070 000000 00 0 0 0 +# RE-1-NEXT: [ 4] .dynstr STRTAB 0000000000002000 000070 000005 00 A 0 0 0 +# RE-2-NEXT: [ 4] .dynstr STRTAB 0000000000002000 000070 000005 00 0 0 0 +# RE-NEXT: [ 5] .foo.3 PROGBITS 0000000000000000 000075 000000 00 0 0 0 +# RE-1-NEXT: [ 6] .symtab SYMTAB 0000000000003000 000075 000030 18 8 2 0 +# RE-2-NEXT: [ 6] .symtab SYMTAB 0000000000003000 000075 000030 18 A 8 2 0 +# RE-NEXT: [ 7] .foo.4 PROGBITS 0000000000000000 0000a5 000000 00 0 0 0 +# RE-1-NEXT: [ 8] .strtab STRTAB 0000000000004000 0000a5 000005 00 0 0 0 +# RE-2-NEXT: [ 8] .strtab STRTAB 0000000000004000 0000a5 000005 00 A 0 0 0 +# RE-NEXT: [ 9] .foo.5 PROGBITS 0000000000000000 0000aa 000000 00 0 0 0 +# RE-1-NEXT: [10] .shstrtab STRTAB 0000000000005000 0000aa 000055 00 0 0 0 +# RE-2-NEXT: [10] .shstrtab STRTAB 0000000000005000 0000aa 000055 00 A 0 0 0 +# RE-NEXT: [11] .foo.6 PROGBITS 0000000000000000 0000ff 000000 00 0 0 0 + +# OUTPUT: --- !ELF +# OUTPUT-NEXT: FileHeader: +# OUTPUT-NEXT: Class: ELFCLASS64 +# OUTPUT-NEXT: Data: ELFDATA2LSB 
+# OUTPUT-NEXT: Type: ET_DYN +# OUTPUT-NEXT: Machine: EM_X86_64 +# OUTPUT-NEXT: Sections: +# OUTPUT-NEXT: - Name: .foo.1 +# OUTPUT-NEXT: Type: SHT_PROGBITS +# OUTPUT-NEXT: - Name: .dynsym +# OUTPUT-NEXT: Type: SHT_DYNSYM +# OUTPUT-NEXT: Flags: [ SHF_ALLOC ] +# OUTPUT-NEXT: Address: 0x0000000000001000 +# OUTPUT-NEXT: Link: .dynstr +# OUTPUT-NEXT: EntSize: 0x0000000000000018 +# OUTPUT-NEXT: - Name: .foo.2 +# OUTPUT-NEXT: Type: SHT_PROGBITS +# OUTPUT-NEXT: - Name: .dynstr +# OUTPUT-NEXT: Type: SHT_STRTAB +# OUTPUT-NEXT: Flags: [ SHF_ALLOC ] +# OUTPUT-NEXT: Address: 0x0000000000002000 +# OUTPUT-NEXT: - Name: .foo.3 +# OUTPUT-NEXT: Type: SHT_PROGBITS +# OUTPUT-NEXT: - Name: .foo.4 +# OUTPUT-NEXT: Type: SHT_PROGBITS +# OUTPUT-NEXT: - Name: .foo.5 +# OUTPUT-NEXT: Type: SHT_PROGBITS +# OUTPUT-NEXT: - Name: .foo.6 +# OUTPUT-NEXT: Type: SHT_PROGBITS +# OUTPUT-NEXT: Symbols: +# OUTPUT-NEXT: - Name: foo +# OUTPUT-NEXT: DynamicSymbols: +# OUTPUT-NEXT: - Name: bar +# OUTPUT-NEXT: ... + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .foo.1 + Type: SHT_PROGBITS + - Name: .dynsym + Type: SHT_DYNSYM + Address: 0x1000 + Flags: [ [[FLAG1]] ] + - Name: .foo.2 + Type: SHT_PROGBITS + - Name: .dynstr + Type: SHT_STRTAB + Address: 0x2000 + Flags: [ [[FLAG1]] ] + - Name: .foo.3 + Type: SHT_PROGBITS + - Name: .symtab + Type: SHT_SYMTAB + Address: 0x3000 + Flags: [ [[FLAG2]] ] + - Name: .foo.4 + Type: SHT_PROGBITS + - Name: .strtab + Type: SHT_STRTAB + Address: 0x4000 + Flags: [ [[FLAG2]] ] + - Name: .foo.5 + Type: SHT_PROGBITS + - Name: .shstrtab + Type: SHT_STRTAB + Address: 0x5000 + Flags: [ [[FLAG2]] ] + - Name: .foo.6 + Type: SHT_PROGBITS +Symbols: + - Name: foo +DynamicSymbols: + - Name: bar + +## Now test the abnormal case: when .symtab (SHT_SYMTAB), +## .strtab (SHT_STRTAB) and .shstrtab (SHT_STRTAB) are +## allocatable sections, but .dynsym (SHT_DYNSYM) and +## .dynstr (SHT_STRTAB) are not. 
+## Check that only allocatable versions are explicitly declared. + +# RUN: yaml2obj %s -o %t2.so -D FLAG1="" -D FLAG2=SHF_ALLOC +# RUN: llvm-readelf -S %t2.so | FileCheck %s --check-prefixes=RE,RE-2 +# RUN: obj2yaml %t2.so | FileCheck %s --check-prefix=OUTPUT2 + +## Check we explicitly declare only allocatable +## SHT_STRTAB/SHT_SYMTAB/SHT_DYNSYM sections. +# OUTPUT2: --- !ELF +# OUTPUT2-NEXT: FileHeader: +# OUTPUT2-NEXT: Class: ELFCLASS64 +# OUTPUT2-NEXT: Data: ELFDATA2LSB +# OUTPUT2-NEXT: Type: ET_DYN +# OUTPUT2-NEXT: Machine: EM_X86_64 +# OUTPUT2-NEXT: Sections: +# OUTPUT2-NEXT: - Name: .foo.1 +# OUTPUT2-NEXT: Type: SHT_PROGBITS +# OUTPUT2-NEXT: - Name: .foo.2 +# OUTPUT2-NEXT: Type: SHT_PROGBITS +# OUTPUT2-NEXT: - Name: .foo.3 +# OUTPUT2-NEXT: Type: SHT_PROGBITS +# OUTPUT2-NEXT: - Name: .symtab +# OUTPUT2-NEXT: Type: SHT_SYMTAB +# OUTPUT2-NEXT: Flags: [ SHF_ALLOC ] +# OUTPUT2-NEXT: Address: 0x0000000000003000 +# OUTPUT2-NEXT: Link: .strtab +# OUTPUT2-NEXT: EntSize: 0x0000000000000018 +# OUTPUT2-NEXT: - Name: .foo.4 +# OUTPUT2-NEXT: Type: SHT_PROGBITS +# OUTPUT2-NEXT: - Name: .strtab +# OUTPUT2-NEXT: Type: SHT_STRTAB +# OUTPUT2-NEXT: Flags: [ SHF_ALLOC ] +# OUTPUT2-NEXT: Address: 0x0000000000004000 +# OUTPUT2-NEXT: - Name: .foo.5 +# OUTPUT2-NEXT: Type: SHT_PROGBITS +# OUTPUT2-NEXT: - Name: .shstrtab +# OUTPUT2-NEXT: Type: SHT_STRTAB +# OUTPUT2-NEXT: Flags: [ SHF_ALLOC ] +# OUTPUT2-NEXT: Address: 0x0000000000005000 +# OUTPUT2-NEXT: - Name: .foo.6 +# OUTPUT2-NEXT: Type: SHT_PROGBITS +# OUTPUT2-NEXT: Symbols: +# OUTPUT2-NEXT: - Name: foo +# OUTPUT2-NEXT: DynamicSymbols: +# OUTPUT2-NEXT: - Name: bar +# OUTPUT2-NEXT: ... 
diff --git a/llvm/test/tools/obj2yaml/versym-section.yaml b/llvm/test/tools/obj2yaml/versym-section.yaml index 38836960615c17..0a04b3165ce210 100644 --- a/llvm/test/tools/obj2yaml/versym-section.yaml +++ b/llvm/test/tools/obj2yaml/versym-section.yaml @@ -19,7 +19,8 @@ # CHECK-NEXT: AddressAlign: 0x0000000000000002 # CHECK-NEXT: EntSize: 0x0000000000000002 # CHECK-NEXT: Entries: [ 0, 3, 4 ] -# CHECK-NEXT: DynamicSymbols: +# CHECK-NEXT: - Name: +# CHECK: DynamicSymbols: # CHECK-NEXT: - Name: f1 # CHECK-NEXT: Binding: STB_GLOBAL # CHECK-NEXT: - Name: f2 diff --git a/llvm/tools/llvm-ar/llvm-ar.cpp b/llvm/tools/llvm-ar/llvm-ar.cpp index 401dc7f8c7d2b7..8e7a85f5a9b06d 100644 --- a/llvm/tools/llvm-ar/llvm-ar.cpp +++ b/llvm/tools/llvm-ar/llvm-ar.cpp @@ -83,6 +83,9 @@ USAGE: llvm-ar [options] [-][modifiers] [relpos] [count] [f =bsd - bsd --plugin= - ignored for compatibility -h --help - display this help and exit + --rsp-quoting - quoting style for response files + =posix - posix + =windows - windows --version - print the version and exit @ - read options from @@ -1096,61 +1099,105 @@ static bool handleGenericOption(StringRef arg) { return false; } +static const char *matchFlagWithArg(StringRef Expected, + ArrayRef::iterator &ArgIt, + ArrayRef Args) { + StringRef Arg = *ArgIt; + + if (Arg.startswith("--")) + Arg = Arg.substr(2); + else if (Arg.startswith("-")) + Arg = Arg.substr(1); + + size_t len = Expected.size(); + if (Arg == Expected) { + if (++ArgIt == Args.end()) + fail(std::string(Expected) + " requires an argument"); + + return *ArgIt; + } + if (Arg.startswith(Expected) && Arg.size() > len && Arg[len] == '=') + return Arg.data() + len + 1; + + return nullptr; +} + +static cl::TokenizerCallback getRspQuoting(ArrayRef ArgsArr) { + cl::TokenizerCallback Ret = + Triple(sys::getProcessTriple()).getOS() == Triple::Win32 + ? 
cl::TokenizeWindowsCommandLine + : cl::TokenizeGNUCommandLine; + + for (ArrayRef::iterator ArgIt = ArgsArr.begin(); + ArgIt != ArgsArr.end(); ++ArgIt) { + if (const char *Match = matchFlagWithArg("rsp-quoting", ArgIt, ArgsArr)) { + StringRef MatchRef = Match; + if (MatchRef == "posix") + Ret = cl::TokenizeGNUCommandLine; + else if (MatchRef == "windows") + Ret = cl::TokenizeWindowsCommandLine; + else + fail(std::string("Invalid response file quoting style ") + Match); + } + } + + return Ret; +} + static int ar_main(int argc, char **argv) { - SmallVector Argv(argv, argv + argc); + SmallVector Argv(argv + 1, argv + argc); StringSaver Saver(Alloc); - cl::ExpandResponseFiles(Saver, cl::TokenizeGNUCommandLine, Argv); - for (size_t i = 1; i < Argv.size(); ++i) { - StringRef Arg = Argv[i]; - const char *match = nullptr; - auto MatchFlagWithArg = [&](const char *expected) { - size_t len = strlen(expected); - if (Arg == expected) { - if (++i >= Argv.size()) - fail(std::string(expected) + " requires an argument"); - match = Argv[i]; - return true; - } - if (Arg.startswith(expected) && Arg.size() > len && Arg[len] == '=') { - match = Arg.data() + len + 1; - return true; - } - return false; - }; - if (handleGenericOption(Argv[i])) + + cl::ExpandResponseFiles(Saver, getRspQuoting(makeArrayRef(argv, argc)), Argv); + + ArrayRef ArgsArr = makeArrayRef(argv, argc); + + for (ArrayRef::iterator ArgIt = Argv.begin(); + ArgIt != Argv.end(); ++ArgIt) { + const char *Match = nullptr; + + if (handleGenericOption(*ArgIt)) return 0; - if (Arg == "--") { - for (; i < Argv.size(); ++i) - PositionalArgs.push_back(Argv[i]); + if (strcmp(*ArgIt, "--") == 0) { + ++ArgIt; + for (; ArgIt != Argv.end(); ++ArgIt) + PositionalArgs.push_back(*ArgIt); break; } - if (Arg[0] == '-') { - if (Arg.startswith("--")) - Arg = Argv[i] + 2; + + if (*ArgIt[0] != '-') { + if (Options.empty()) + Options += *ArgIt; else - Arg = Argv[i] + 1; - if (Arg == "M") { - MRI = true; - } else if (MatchFlagWithArg("format")) { 
- FormatType = StringSwitch(match) - .Case("default", Default) - .Case("gnu", GNU) - .Case("darwin", DARWIN) - .Case("bsd", BSD) - .Default(Unknown); - if (FormatType == Unknown) - fail(std::string("Invalid format ") + match); - } else if (MatchFlagWithArg("plugin")) { - // Ignored. - } else { - Options += Argv[i] + 1; - } - } else if (Options.empty()) { - Options += Argv[i]; - } else { - PositionalArgs.push_back(Argv[i]); + PositionalArgs.push_back(*ArgIt); + continue; } + + if (strcmp(*ArgIt, "-M") == 0) { + MRI = true; + continue; + } + + Match = matchFlagWithArg("format", ArgIt, Argv); + if (Match) { + FormatType = StringSwitch(Match) + .Case("default", Default) + .Case("gnu", GNU) + .Case("darwin", DARWIN) + .Case("bsd", BSD) + .Default(Unknown); + if (FormatType == Unknown) + fail(std::string("Invalid format ") + Match); + continue; + } + + if (matchFlagWithArg("plugin", ArgIt, Argv) || + matchFlagWithArg("rsp-quoting", ArgIt, Argv)) + continue; + + Options += *ArgIt + 1; } + ArchiveOperation Operation = parseCommandLine(); return performOperation(Operation, nullptr); } diff --git a/llvm/tools/llvm-gsym/llvm-gsymutil.cpp b/llvm/tools/llvm-gsym/llvm-gsymutil.cpp index c7d6cf33da67a6..a3be9e3149dbe9 100644 --- a/llvm/tools/llvm-gsym/llvm-gsymutil.cpp +++ b/llvm/tools/llvm-gsym/llvm-gsymutil.cpp @@ -179,7 +179,8 @@ static bool filterArch(MachOObjectFile &Obj) { if (ArchFilters.empty()) return true; - StringRef ObjArch = Obj.getArchTriple().getArchName(); + Triple ObjTriple(Obj.getArchTriple()); + StringRef ObjArch = ObjTriple.getArchName(); for (auto Arch : ArchFilters) { // Match name. 
@@ -350,7 +351,8 @@ static llvm::Error handleBuffer(StringRef Filename, MemoryBufferRef Buffer, error(Filename, errorToErrorCode(BinOrErr.takeError())); if (auto *Obj = dyn_cast(BinOrErr->get())) { - auto ArchName = Obj->makeTriple().getArchName(); + Triple ObjTriple(Obj->makeTriple()); + auto ArchName = ObjTriple.getArchName(); outs() << "Output file (" << ArchName << "): " << OutFile << "\n"; if (auto Err = handleObjectFile(*Obj, OutFile.c_str())) return Err; @@ -374,7 +376,8 @@ static llvm::Error handleBuffer(StringRef Filename, MemoryBufferRef Buffer, // Now handle each architecture we need to convert. for (auto &Obj: FilterObjs) { - auto ArchName = Obj->getArchTriple().getArchName(); + Triple ObjTriple(Obj->getArchTriple()); + auto ArchName = ObjTriple.getArchName(); std::string ArchOutFile(OutFile); // If we are only handling a single architecture, then we will use the // normal output file. If we are handling multiple architectures append diff --git a/llvm/tools/obj2yaml/elf2yaml.cpp b/llvm/tools/obj2yaml/elf2yaml.cpp index 08c3587a821d5e..180457bb6d91eb 100644 --- a/llvm/tools/obj2yaml/elf2yaml.cpp +++ b/llvm/tools/obj2yaml/elf2yaml.cpp @@ -55,6 +55,7 @@ class ELFDumper { std::vector &Symbols); Error dumpSymbol(const Elf_Sym *Sym, const Elf_Shdr *SymTab, StringRef StrTable, ELFYAML::Symbol &S); + Expected>> dumpSections(); Error dumpCommonSection(const Elf_Shdr *Shdr, ELFYAML::Section &S); Error dumpCommonRelocationSection(const Elf_Shdr *Shdr, ELFYAML::RelocationSection &S); @@ -228,26 +229,53 @@ template Expected ELFDumper::dump() { return std::move(E); } + if (Expected>> ChunksOrErr = + dumpSections()) + Y->Chunks = std::move(*ChunksOrErr); + else + return ChunksOrErr.takeError(); + + return Y.release(); +} + +template +Expected>> +ELFDumper::dumpSections() { + std::vector> Ret; + for (const Elf_Shdr &Sec : Sections) { switch (Sec.sh_type) { case ELF::SHT_DYNAMIC: { Expected SecOrErr = dumpDynamicSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); 
- Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_STRTAB: case ELF::SHT_SYMTAB: - case ELF::SHT_DYNSYM: - // Do not dump these sections. + case ELF::SHT_DYNSYM: { + // The contents of these sections are described by other parts of the YAML + // file. We still dump them so that their positions in the section header + // table are correctly recorded. We only dump allocatable section because + // their positions and addresses are important, e.g. for creating program + // headers. Some sections, like .symtab or .strtab normally are not + // allocatable and do not have virtual addresses. We want to avoid noise + // in the YAML output and assume that they are placed at the end. + if (Sec.sh_flags & ELF::SHF_ALLOC) { + auto S = std::make_unique(); + if (Error E = dumpCommonSection(&Sec, *S.get())) + return std::move(E); + Ret.emplace_back(std::move(S)); + } break; + } case ELF::SHT_SYMTAB_SHNDX: { Expected SecOrErr = dumpSymtabShndxSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_REL: @@ -255,84 +283,84 @@ template Expected ELFDumper::dump() { Expected SecOrErr = dumpRelocSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_RELR: { Expected SecOrErr = dumpRelrSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_GROUP: { Expected GroupOrErr = dumpGroup(&Sec); if (!GroupOrErr) return GroupOrErr.takeError(); - Y->Chunks.emplace_back(*GroupOrErr); + Ret.emplace_back(*GroupOrErr); break; } case ELF::SHT_MIPS_ABIFLAGS: { Expected SecOrErr = dumpMipsABIFlags(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_NOBITS: { Expected SecOrErr = 
dumpNoBitsSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_NOTE: { Expected SecOrErr = dumpNoteSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_HASH: { Expected SecOrErr = dumpHashSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_GNU_HASH: { Expected SecOrErr = dumpGnuHashSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_GNU_verdef: { Expected SecOrErr = dumpVerdefSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_GNU_versym: { Expected SecOrErr = dumpSymverSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_GNU_verneed: { Expected SecOrErr = dumpVerneedSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_LLVM_ADDRSIG: { Expected SecOrErr = dumpAddrsigSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_LLVM_LINKER_OPTIONS: { @@ -340,7 +368,7 @@ template Expected ELFDumper::dump() { dumpLinkerOptionsSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_LLVM_DEPENDENT_LIBRARIES: { @@ -348,7 +376,7 @@ template Expected ELFDumper::dump() { dumpDependentLibrariesSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } 
case ELF::SHT_LLVM_CALL_GRAPH_PROFILE: { @@ -356,7 +384,7 @@ template Expected ELFDumper::dump() { dumpCallGraphProfileSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); break; } case ELF::SHT_NULL: { @@ -378,7 +406,7 @@ template Expected ELFDumper::dump() { if (!SpecialSecOrErr) return SpecialSecOrErr.takeError(); if (*SpecialSecOrErr) { - Y->Chunks.emplace_back(*SpecialSecOrErr); + Ret.emplace_back(*SpecialSecOrErr); break; } } @@ -387,12 +415,11 @@ template Expected ELFDumper::dump() { dumpContentSection(&Sec); if (!SecOrErr) return SecOrErr.takeError(); - Y->Chunks.emplace_back(*SecOrErr); + Ret.emplace_back(*SecOrErr); } } } - - return Y.release(); + return std::move(Ret); } template diff --git a/llvm/unittests/Analysis/CMakeLists.txt b/llvm/unittests/Analysis/CMakeLists.txt index d66dd39c601367..6fabd940a74af7 100644 --- a/llvm/unittests/Analysis/CMakeLists.txt +++ b/llvm/unittests/Analysis/CMakeLists.txt @@ -23,6 +23,7 @@ add_llvm_unittest(AnalysisTests LazyCallGraphTest.cpp LoadsTest.cpp LoopInfoTest.cpp + LoopNestTest.cpp MemoryBuiltinsTest.cpp MemorySSATest.cpp OrderedInstructionsTest.cpp diff --git a/llvm/unittests/Analysis/LoopNestTest.cpp b/llvm/unittests/Analysis/LoopNestTest.cpp new file mode 100644 index 00000000000000..4e31b1f2e9046c --- /dev/null +++ b/llvm/unittests/Analysis/LoopNestTest.cpp @@ -0,0 +1,194 @@ +//===- LoopNestTest.cpp - LoopNestAnalysis unit tests ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/LoopNestAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Support/SourceMgr.h" +#include "gtest/gtest.h" + +using namespace llvm; + +/// Build the loop nest analysis for a loop nest and run the given test \p Test. +static void runTest( + Module &M, StringRef FuncName, + function_ref Test) { + auto *F = M.getFunction(FuncName); + ASSERT_NE(F, nullptr) << "Could not find " << FuncName; + + TargetLibraryInfoImpl TLII; + TargetLibraryInfo TLI(TLII); + AssumptionCache AC(*F); + DominatorTree DT(*F); + LoopInfo LI(DT); + ScalarEvolution SE(*F, TLI, AC, DT, LI); + + Test(*F, LI, SE); +} + +static std::unique_ptr makeLLVMModule(LLVMContext &Context, + const char *ModuleStr) { + SMDiagnostic Err; + return parseAssemblyString(ModuleStr, Err, Context); +} + +TEST(LoopNestTest, PerfectLoopNest) { + const char *ModuleStr = + "target datalayout = \"e-m:o-i64:64-f80:128-n8:16:32:64-S128\"\n" + "define void @foo(i64 signext %nx, i64 signext %ny) {\n" + "entry:\n" + " br label %for.outer\n" + "for.outer:\n" + " %i = phi i64 [ 0, %entry ], [ %inc13, %for.outer.latch ]\n" + " %cmp21 = icmp slt i64 0, %ny\n" + " br i1 %cmp21, label %for.inner.preheader, label %for.outer.latch\n" + "for.inner.preheader:\n" + " br label %for.inner\n" + "for.inner:\n" + " %j = phi i64 [ 0, %for.inner.preheader ], [ %inc, %for.inner.latch ]\n" + " br label %for.inner.latch\n" + "for.inner.latch:\n" + " %inc = add nsw i64 %j, 1\n" + " %cmp2 = icmp slt i64 %inc, %ny\n" + " br i1 %cmp2, label %for.inner, label %for.inner.exit\n" + "for.inner.exit:\n" + " br label %for.outer.latch\n" + "for.outer.latch:\n" + " %inc13 = add nsw i64 %i, 1\n" + " %cmp = icmp slt i64 %inc13, %nx\n" + " br i1 %cmp, label 
%for.outer, label %for.outer.exit\n" + "for.outer.exit:\n" + " br label %for.end\n" + "for.end:\n" + " ret void\n" + "}\n"; + + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleStr); + + runTest(*M, "foo", [&](Function &F, LoopInfo &LI, ScalarEvolution &SE) { + Function::iterator FI = F.begin(); + // Skip the first basic block (entry), get to the outer loop header. + BasicBlock *Header = &*(++FI); + assert(Header->getName() == "for.outer"); + Loop *L = LI.getLoopFor(Header); + EXPECT_NE(L, nullptr); + + LoopNest LN(*L, SE); + EXPECT_TRUE(LN.areAllLoopsSimplifyForm()); + + // Ensure that we can identify the outermost loop in the nest. + const Loop &OL = LN.getOutermostLoop(); + EXPECT_EQ(OL.getName(), "for.outer"); + + // Ensure that we can identify the innermost loop in the nest. + const Loop *IL = LN.getInnermostLoop(); + EXPECT_NE(IL, nullptr); + EXPECT_EQ(IL->getName(), "for.inner"); + + // Ensure the loop nest is recognized as having 2 loops. + const ArrayRef Loops = LN.getLoops(); + EXPECT_EQ(Loops.size(), 2ull); + + // Ensure the loop nest is recognized as perfect in its entirety. + const SmallVector &PLV = LN.getPerfectLoops(SE); + EXPECT_EQ(PLV.size(), 1ull); + EXPECT_EQ(PLV.front().size(), 2ull); + + // Ensure the nest depth and perfect nest depth are computed correctly. 
+ EXPECT_EQ(LN.getNestDepth(), 2u); + EXPECT_EQ(LN.getMaxPerfectDepth(), 2u); + }); +} + +TEST(LoopNestTest, ImperfectLoopNest) { + const char *ModuleStr = + "target datalayout = \"e-m:o-i64:64-f80:128-n8:16:32:64-S128\"\n" + "define void @foo(i32 signext %nx, i32 signext %ny, i32 signext %nk) {\n" + "entry:\n" + " br label %loop.i\n" + "loop.i:\n" + " %i = phi i32 [ 0, %entry ], [ %inci, %for.inci ]\n" + " %cmp21 = icmp slt i32 0, %ny\n" + " br i1 %cmp21, label %loop.j.preheader, label %for.inci\n" + "loop.j.preheader:\n" + " br label %loop.j\n" + "loop.j:\n" + " %j = phi i32 [ %incj, %for.incj ], [ 0, %loop.j.preheader ]\n" + " %cmp22 = icmp slt i32 0, %nk\n" + " br i1 %cmp22, label %loop.k.preheader, label %for.incj\n" + "loop.k.preheader:\n" + " call void @bar()\n" + " br label %loop.k\n" + "loop.k:\n" + " %k = phi i32 [ %inck, %for.inck ], [ 0, %loop.k.preheader ]\n" + " br label %for.inck\n" + "for.inck:\n" + " %inck = add nsw i32 %k, 1\n" + " %cmp5 = icmp slt i32 %inck, %nk\n" + " br i1 %cmp5, label %loop.k, label %for.incj.loopexit\n" + "for.incj.loopexit:\n" + " br label %for.incj\n" + "for.incj:\n" + " %incj = add nsw i32 %j, 1\n" + " %cmp2 = icmp slt i32 %incj, %ny\n" + " br i1 %cmp2, label %loop.j, label %for.inci.loopexit\n" + "for.inci.loopexit:\n" + " br label %for.inci\n" + "for.inci:\n" + " %inci = add nsw i32 %i, 1\n" + " %cmp = icmp slt i32 %inci, %nx\n" + " br i1 %cmp, label %loop.i, label %loop.i.end\n" + "loop.i.end:\n" + " ret void\n" + "}\n" + "declare void @bar()\n"; + + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleStr); + + runTest(*M, "foo", [&](Function &F, LoopInfo &LI, ScalarEvolution &SE) { + Function::iterator FI = F.begin(); + // Skip the first basic block (entry), get to the outermost loop header. 
+ BasicBlock *Header = &*(++FI); + assert(Header->getName() == "loop.i"); + Loop *L = LI.getLoopFor(Header); + EXPECT_NE(L, nullptr); + + LoopNest LN(*L, SE); + EXPECT_TRUE(LN.areAllLoopsSimplifyForm()); + + dbgs() << "LN: " << LN << "\n"; + + // Ensure that we can identify the outermost loop in the nest. + const Loop &OL = LN.getOutermostLoop(); + EXPECT_EQ(OL.getName(), "loop.i"); + + // Ensure that we can identify the innermost loop in the nest. + const Loop *IL = LN.getInnermostLoop(); + EXPECT_NE(IL, nullptr); + EXPECT_EQ(IL->getName(), "loop.k"); + + // Ensure the loop nest is recognized as having 3 loops. + const ArrayRef Loops = LN.getLoops(); + EXPECT_EQ(Loops.size(), 3ull); + + // Ensure the loop nest is recognized as having 2 separate perfect loops groups. + const SmallVector &PLV = LN.getPerfectLoops(SE); + EXPECT_EQ(PLV.size(), 2ull); + EXPECT_EQ(PLV.front().size(), 2ull); + EXPECT_EQ(PLV.back().size(), 1ull); + + // Ensure the nest depth and perfect nest depth are computed correctly. 
+ EXPECT_EQ(LN.getNestDepth(), 3u); + EXPECT_EQ(LN.getMaxPerfectDepth(), 2u); + }); +} + diff --git a/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp b/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp index 8072e05c7cea0c..6c4e42aa7e05f1 100644 --- a/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp +++ b/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp @@ -65,6 +65,20 @@ class ProfileSummaryInfoTest : public testing::Test { " %y2 = phi i32 [0, %bb1], [1, %bb2] \n" " ret i32 %y2\n" "}\n" + "define i32 @l(i32 %x) {{\n" + "bb0:\n" + " %y1 = icmp eq i32 %x, 0 \n" + " br i1 %y1, label %bb1, label %bb2, !prof !23 \n" + "bb1:\n" + " %z1 = call i32 @g(i32 %x)\n" + " br label %bb3\n" + "bb2:\n" + " %z2 = call i32 @h(i32 %x)\n" + " br label %bb3\n" + "bb3:\n" + " %y2 = phi i32 [0, %bb1], [1, %bb2] \n" + " ret i32 %y2\n" + "}\n" "!20 = !{{!\"function_entry_count\", i64 400}\n" "!21 = !{{!\"function_entry_count\", i64 1}\n" "!22 = !{{!\"function_entry_count\", i64 100}\n" @@ -141,14 +155,26 @@ TEST_F(ProfileSummaryInfoTest, TestCommon) { EXPECT_FALSE(PSI.isHotCountNthPercentile(990000, 100)); EXPECT_FALSE(PSI.isHotCountNthPercentile(990000, 2)); + EXPECT_FALSE(PSI.isColdCountNthPercentile(990000, 400)); + EXPECT_TRUE(PSI.isColdCountNthPercentile(990000, 100)); + EXPECT_TRUE(PSI.isColdCountNthPercentile(990000, 2)); + EXPECT_TRUE(PSI.isHotCountNthPercentile(999999, 400)); EXPECT_TRUE(PSI.isHotCountNthPercentile(999999, 100)); EXPECT_FALSE(PSI.isHotCountNthPercentile(999999, 2)); + EXPECT_FALSE(PSI.isColdCountNthPercentile(999999, 400)); + EXPECT_FALSE(PSI.isColdCountNthPercentile(999999, 100)); + EXPECT_TRUE(PSI.isColdCountNthPercentile(999999, 2)); + EXPECT_FALSE(PSI.isHotCountNthPercentile(10000, 400)); EXPECT_FALSE(PSI.isHotCountNthPercentile(10000, 100)); EXPECT_FALSE(PSI.isHotCountNthPercentile(10000, 2)); + EXPECT_TRUE(PSI.isColdCountNthPercentile(10000, 400)); + EXPECT_TRUE(PSI.isColdCountNthPercentile(10000, 100)); + 
EXPECT_TRUE(PSI.isColdCountNthPercentile(10000, 2)); + EXPECT_TRUE(PSI.isFunctionEntryHot(F)); EXPECT_FALSE(PSI.isFunctionEntryHot(G)); EXPECT_FALSE(PSI.isFunctionEntryHot(H)); @@ -177,16 +203,31 @@ TEST_F(ProfileSummaryInfoTest, InstrProf) { EXPECT_FALSE(PSI.isHotBlockNthPercentile(990000, BB2, &BFI)); EXPECT_TRUE(PSI.isHotBlockNthPercentile(990000, BB3, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, &BB0, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, BB1, &BFI)); + EXPECT_TRUE(PSI.isColdBlockNthPercentile(990000, BB2, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, BB3, &BFI)); + EXPECT_TRUE(PSI.isHotBlockNthPercentile(999900, &BB0, &BFI)); EXPECT_TRUE(PSI.isHotBlockNthPercentile(999900, BB1, &BFI)); EXPECT_TRUE(PSI.isHotBlockNthPercentile(999900, BB2, &BFI)); EXPECT_TRUE(PSI.isHotBlockNthPercentile(999900, BB3, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(999900, &BB0, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(999900, BB1, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(999900, BB2, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(999900, BB3, &BFI)); + EXPECT_FALSE(PSI.isHotBlockNthPercentile(10000, &BB0, &BFI)); EXPECT_FALSE(PSI.isHotBlockNthPercentile(10000, BB1, &BFI)); EXPECT_FALSE(PSI.isHotBlockNthPercentile(10000, BB2, &BFI)); EXPECT_FALSE(PSI.isHotBlockNthPercentile(10000, BB3, &BFI)); + EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, &BB0, &BFI)); + EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, BB1, &BFI)); + EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, BB2, &BFI)); + EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, BB3, &BFI)); + CallSite CS1(BB1->getFirstNonPHI()); auto *CI2 = BB2->getFirstNonPHI(); CallSite CS2(CI2); @@ -201,6 +242,31 @@ TEST_F(ProfileSummaryInfoTest, InstrProf) { EXPECT_FALSE(PSI.isHotCallSite(CS2, &BFI)); } +TEST_F(ProfileSummaryInfoTest, InstrProfNoFuncEntryCount) { + auto M = makeLLVMModule("InstrProf"); + Function *F = M->getFunction("l"); + ProfileSummaryInfo 
PSI = buildPSI(M.get()); + EXPECT_TRUE(PSI.hasProfileSummary()); + EXPECT_TRUE(PSI.hasInstrumentationProfile()); + + BasicBlock &BB0 = F->getEntryBlock(); + BasicBlock *BB1 = BB0.getTerminator()->getSuccessor(0); + BasicBlock *BB2 = BB0.getTerminator()->getSuccessor(1); + BasicBlock *BB3 = BB1->getSingleSuccessor(); + + BlockFrequencyInfo BFI = buildBFI(*F); + + // Without the entry count, all should return false. + EXPECT_FALSE(PSI.isHotBlockNthPercentile(990000, &BB0, &BFI)); + EXPECT_FALSE(PSI.isHotBlockNthPercentile(990000, BB1, &BFI)); + EXPECT_FALSE(PSI.isHotBlockNthPercentile(990000, BB2, &BFI)); + EXPECT_FALSE(PSI.isHotBlockNthPercentile(990000, BB3, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, &BB0, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, BB1, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, BB2, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, BB3, &BFI)); +} + TEST_F(ProfileSummaryInfoTest, SampleProf) { auto M = makeLLVMModule("SampleProfile"); Function *F = M->getFunction("f"); @@ -224,16 +290,31 @@ TEST_F(ProfileSummaryInfoTest, SampleProf) { EXPECT_FALSE(PSI.isHotBlockNthPercentile(990000, BB2, &BFI)); EXPECT_TRUE(PSI.isHotBlockNthPercentile(990000, BB3, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, &BB0, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, BB1, &BFI)); + EXPECT_TRUE(PSI.isColdBlockNthPercentile(990000, BB2, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, BB3, &BFI)); + EXPECT_TRUE(PSI.isHotBlockNthPercentile(999900, &BB0, &BFI)); EXPECT_TRUE(PSI.isHotBlockNthPercentile(999900, BB1, &BFI)); EXPECT_TRUE(PSI.isHotBlockNthPercentile(999900, BB2, &BFI)); EXPECT_TRUE(PSI.isHotBlockNthPercentile(999900, BB3, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(999900, &BB0, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(999900, BB1, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(999900, BB2, &BFI)); + 
EXPECT_FALSE(PSI.isColdBlockNthPercentile(999900, BB3, &BFI)); + EXPECT_FALSE(PSI.isHotBlockNthPercentile(10000, &BB0, &BFI)); EXPECT_FALSE(PSI.isHotBlockNthPercentile(10000, BB1, &BFI)); EXPECT_FALSE(PSI.isHotBlockNthPercentile(10000, BB2, &BFI)); EXPECT_FALSE(PSI.isHotBlockNthPercentile(10000, BB3, &BFI)); + EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, &BB0, &BFI)); + EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, BB1, &BFI)); + EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, BB2, &BFI)); + EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, BB3, &BFI)); + CallSite CS1(BB1->getFirstNonPHI()); auto *CI2 = BB2->getFirstNonPHI(); // Manually attach branch weights metadata to the call instruction. @@ -250,6 +331,51 @@ TEST_F(ProfileSummaryInfoTest, SampleProf) { // weights that exceed the hot count threshold. CI2->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights({400})); EXPECT_TRUE(PSI.isHotCallSite(CS2, &BFI)); + + { + Function *F = M->getFunction("l"); + BlockFrequencyInfo BFI = buildBFI(*F); + BasicBlock &BB0 = F->getEntryBlock(); + BasicBlock *BB1 = BB0.getTerminator()->getSuccessor(0); + BasicBlock *BB2 = BB0.getTerminator()->getSuccessor(1); + BasicBlock *BB3 = BB1->getSingleSuccessor(); + + // Without the entry count, all should return false. 
+ EXPECT_FALSE(PSI.isHotBlockNthPercentile(990000, &BB0, &BFI)); + EXPECT_FALSE(PSI.isHotBlockNthPercentile(990000, BB1, &BFI)); + EXPECT_FALSE(PSI.isHotBlockNthPercentile(990000, BB2, &BFI)); + EXPECT_FALSE(PSI.isHotBlockNthPercentile(990000, BB3, &BFI)); + + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, &BB0, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, BB1, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, BB2, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, BB3, &BFI)); + } +} + +TEST_F(ProfileSummaryInfoTest, SampleProfNoFuncEntryCount) { + auto M = makeLLVMModule("SampleProfile"); + Function *F = M->getFunction("l"); + ProfileSummaryInfo PSI = buildPSI(M.get()); + EXPECT_TRUE(PSI.hasProfileSummary()); + EXPECT_TRUE(PSI.hasSampleProfile()); + + BasicBlock &BB0 = F->getEntryBlock(); + BasicBlock *BB1 = BB0.getTerminator()->getSuccessor(0); + BasicBlock *BB2 = BB0.getTerminator()->getSuccessor(1); + BasicBlock *BB3 = BB1->getSingleSuccessor(); + + BlockFrequencyInfo BFI = buildBFI(*F); + + // Without the entry count, all should return false. 
+ EXPECT_FALSE(PSI.isHotBlockNthPercentile(990000, &BB0, &BFI)); + EXPECT_FALSE(PSI.isHotBlockNthPercentile(990000, BB1, &BFI)); + EXPECT_FALSE(PSI.isHotBlockNthPercentile(990000, BB2, &BFI)); + EXPECT_FALSE(PSI.isHotBlockNthPercentile(990000, BB3, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, &BB0, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, BB1, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, BB2, &BFI)); + EXPECT_FALSE(PSI.isColdBlockNthPercentile(990000, BB3, &BFI)); } } // end anonymous namespace diff --git a/llvm/unittests/Support/CompressionTest.cpp b/llvm/unittests/Support/CompressionTest.cpp index 51723898e950d0..cc7be431b62bc3 100644 --- a/llvm/unittests/Support/CompressionTest.cpp +++ b/llvm/unittests/Support/CompressionTest.cpp @@ -21,7 +21,7 @@ using namespace llvm; namespace { -#if LLVM_ENABLE_ZLIB +#if LLVM_ENABLE_ZLIB == 1 && HAVE_LIBZ void TestZlibCompression(StringRef Input, int Level) { SmallString<32> Compressed; diff --git a/llvm/unittests/Transforms/Utils/KnowledgeRetentionTest.cpp b/llvm/unittests/Transforms/Utils/KnowledgeRetentionTest.cpp index 5c84a25745e594..08f2c6441645bb 100644 --- a/llvm/unittests/Transforms/Utils/KnowledgeRetentionTest.cpp +++ b/llvm/unittests/Transforms/Utils/KnowledgeRetentionTest.cpp @@ -41,7 +41,7 @@ static void RunTest( } } -void AssertMatchesExactlyAttributes(CallInst *Assume, Value *WasOn, +static void AssertMatchesExactlyAttributes(CallInst *Assume, Value *WasOn, StringRef AttrToMatch) { Regex Reg(AttrToMatch); SmallVector Matches; @@ -57,7 +57,7 @@ void AssertMatchesExactlyAttributes(CallInst *Assume, Value *WasOn, } } -void AssertHasTheRightValue(CallInst *Assume, Value *WasOn, +static void AssertHasTheRightValue(CallInst *Assume, Value *WasOn, Attribute::AttrKind Kind, unsigned Value, bool Both, AssumeQuery AQ = AssumeQuery::Highest) { if (!Both) { @@ -80,7 +80,7 @@ void AssertHasTheRightValue(CallInst *Assume, Value *WasOn, } } -TEST(AssumeQueryAPI, Basic) { 
+TEST(AssumeQueryAPI, hasAttributeInAssume) { StringRef Head = "declare void @llvm.assume(i1)\n" "declare void @func(i32*, i32*)\n" @@ -216,3 +216,174 @@ TEST(AssumeQueryAPI, Basic) { })); RunTest(Head, Tail, Tests); } + +static void AssertFindExactlyAttributes(RetainedKnowledgeMap &Map, Value *WasOn, + StringRef AttrToMatch) { + Regex Reg(AttrToMatch); + SmallVector Matches; + for (StringRef Attr : { +#define GET_ATTR_NAMES +#define ATTRIBUTE_ENUM(ENUM_NAME, DISPLAY_NAME) StringRef(#DISPLAY_NAME), +#include "llvm/IR/Attributes.inc" + }) { + bool ShouldHaveAttr = Reg.match(Attr, &Matches) && Matches[0] == Attr; + + if (ShouldHaveAttr != (Map.find(RetainedKnowledgeKey{WasOn, Attribute::getAttrKindFromName(Attr)}) != Map.end())) { + ASSERT_TRUE(false); + } + } +} + +static void AssertMapHasRightValue(RetainedKnowledgeMap &Map, + RetainedKnowledgeKey Key, MinMax MM) { + auto LookupIt = Map.find(Key); + ASSERT_TRUE(LookupIt != Map.end()); + ASSERT_TRUE(LookupIt->second.Min == MM.Min); + ASSERT_TRUE(LookupIt->second.Max == MM.Max); +} + +TEST(AssumeQueryAPI, fillMapFromAssume) { + StringRef Head = + "declare void @llvm.assume(i1)\n" + "declare void @func(i32*, i32*)\n" + "declare void @func1(i32*, i32*, i32*, i32*)\n" + "declare void @func_many(i32*) \"no-jump-tables\" nounwind " + "\"less-precise-fpmad\" willreturn norecurse\n" + "define void @test(i32* %P, i32* %P1, i32* %P2, i32* %P3) {\n"; + StringRef Tail = "ret void\n" + "}"; + std::vector>> + Tests; + Tests.push_back(std::make_pair( + "call void @func(i32* nonnull align 4 dereferenceable(16) %P, i32* align " + "8 noalias %P1)\n", + [](Instruction *I) { + CallInst *Assume = BuildAssumeFromInst(I); + Assume->insertBefore(I); + + RetainedKnowledgeMap Map; + fillMapFromAssume(*Assume, Map); + AssertFindExactlyAttributes(Map, I->getOperand(0), + "(nonnull|align|dereferenceable)"); + AssertFindExactlyAttributes(Map, I->getOperand(1), + "(noalias|align)"); + AssertMapHasRightValue( + Map, {I->getOperand(0), 
Attribute::Dereferenceable}, {16, 16}); + AssertMapHasRightValue(Map, {I->getOperand(0), Attribute::Alignment}, + {4, 4}); + AssertMapHasRightValue(Map, {I->getOperand(0), Attribute::Alignment}, + {4, 4}); + })); + Tests.push_back(std::make_pair( + "call void @func1(i32* nonnull align 32 dereferenceable(48) %P, i32* " + "nonnull " + "align 8 dereferenceable(28) %P, i32* nonnull align 64 " + "dereferenceable(4) " + "%P, i32* nonnull align 16 dereferenceable(12) %P)\n", + [](Instruction *I) { + CallInst *Assume = BuildAssumeFromInst(I); + Assume->insertBefore(I); + + RetainedKnowledgeMap Map; + fillMapFromAssume(*Assume, Map); + + AssertFindExactlyAttributes(Map, I->getOperand(0), + "(nonnull|align|dereferenceable)"); + AssertFindExactlyAttributes(Map, I->getOperand(1), + "(nonnull|align|dereferenceable)"); + AssertFindExactlyAttributes(Map, I->getOperand(2), + "(nonnull|align|dereferenceable)"); + AssertFindExactlyAttributes(Map, I->getOperand(3), + "(nonnull|align|dereferenceable)"); + AssertMapHasRightValue( + Map, {I->getOperand(0), Attribute::Dereferenceable}, {4, 48}); + AssertMapHasRightValue(Map, {I->getOperand(0), Attribute::Alignment}, + {8, 64}); + })); + Tests.push_back(std::make_pair( + "call void @func_many(i32* align 8 %P1) cold\n", [](Instruction *I) { + ShouldPreserveAllAttributes.setValue(true); + CallInst *Assume = BuildAssumeFromInst(I); + Assume->insertBefore(I); + + RetainedKnowledgeMap Map; + fillMapFromAssume(*Assume, Map); + + AssertFindExactlyAttributes( + Map, nullptr, "(nounwind|norecurse|willreturn|cold)"); + ShouldPreserveAllAttributes.setValue(false); + })); + Tests.push_back( + std::make_pair("call void @llvm.assume(i1 true)\n", [](Instruction *I) { + RetainedKnowledgeMap Map; + fillMapFromAssume(*cast(I), Map); + + AssertFindExactlyAttributes(Map, nullptr, ""); + ASSERT_TRUE(Map.empty()); + })); + Tests.push_back(std::make_pair( + "call void @func1(i32* readnone align 32 " + "dereferenceable(48) noalias %P, i32* " + "align 8 
dereferenceable(28) %P1, i32* align 64 " + "dereferenceable(4) " + "%P2, i32* nonnull align 16 dereferenceable(12) %P3)\n", + [](Instruction *I) { + CallInst *Assume = BuildAssumeFromInst(I); + Assume->insertBefore(I); + + RetainedKnowledgeMap Map; + fillMapFromAssume(*Assume, Map); + + AssertFindExactlyAttributes(Map, I->getOperand(0), + "(readnone|align|dereferenceable|noalias)"); + AssertFindExactlyAttributes(Map, I->getOperand(1), + "(align|dereferenceable)"); + AssertFindExactlyAttributes(Map, I->getOperand(2), + "(align|dereferenceable)"); + AssertFindExactlyAttributes(Map, I->getOperand(3), + "(nonnull|align|dereferenceable)"); + AssertMapHasRightValue(Map, {I->getOperand(0), Attribute::Alignment}, + {32, 32}); + AssertMapHasRightValue( + Map, {I->getOperand(0), Attribute::Dereferenceable}, {48, 48}); + AssertMapHasRightValue( + Map, {I->getOperand(0), Attribute::NoAlias}, {0, 0}); + AssertMapHasRightValue( + Map, {I->getOperand(1), Attribute::Dereferenceable}, {28, 28}); + AssertMapHasRightValue(Map, {I->getOperand(1), Attribute::Alignment}, + {8, 8}); + AssertMapHasRightValue(Map, {I->getOperand(2), Attribute::Alignment}, + {64, 64}); + AssertMapHasRightValue( + Map, {I->getOperand(2), Attribute::Dereferenceable}, {4, 4}); + AssertMapHasRightValue(Map, {I->getOperand(3), Attribute::Alignment}, + {16, 16}); + AssertMapHasRightValue( + Map, {I->getOperand(3), Attribute::Dereferenceable}, {12, 12}); + })); + + /// Keep this test last as it modifies the function. 
+ Tests.push_back(std::make_pair( + "call void @func(i32* nonnull align 4 dereferenceable(16) %P, i32* align " + "8 noalias %P1)\n", + [](Instruction *I) { + CallInst *Assume = BuildAssumeFromInst(I); + Assume->insertBefore(I); + + RetainedKnowledgeMap Map; + fillMapFromAssume(*Assume, Map); + + Value *New = I->getFunction()->getArg(3); + Value *Old = I->getOperand(0); + AssertFindExactlyAttributes(Map, New, ""); + AssertFindExactlyAttributes(Map, Old, + "(nonnull|align|dereferenceable)"); + Old->replaceAllUsesWith(New); + Map.clear(); + fillMapFromAssume(*Assume, Map); + AssertFindExactlyAttributes(Map, New, + "(nonnull|align|dereferenceable)"); + AssertFindExactlyAttributes(Map, Old, ""); + })); + RunTest(Head, Tail, Tests); +} diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index e08ea0c701fe09..3a25620f744a6f 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -50,6 +50,7 @@ TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) { EXPECT_EQ(7u, VecBB->size()); EXPECT_EQ(2u, VecBB->getNumPredecessors()); EXPECT_EQ(2u, VecBB->getNumSuccessors()); + EXPECT_EQ(&*Plan, VecBB->getPlan()); auto Iter = VecBB->begin(); VPInstruction *Phi = dyn_cast(&*Iter++); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanPredicatorTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanPredicatorTest.cpp index 81ed3cee3d2a81..dccbe9c4cf6534 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanPredicatorTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanPredicatorTest.cpp @@ -102,6 +102,13 @@ TEST_F(VPlanPredicatorTest, BasicPredicatorTest) { EXPECT_EQ(InnerLoopLinSucc, OuterIf); EXPECT_EQ(OuterIfLinSucc, InnerIf); EXPECT_EQ(InnerIfLinSucc, InnerLoopLatch); + + // Check that the containing VPlan is set correctly. 
+ EXPECT_EQ(&*Plan, InnerLoopLinSucc->getPlan()); + EXPECT_EQ(&*Plan, OuterIfLinSucc->getPlan()); + EXPECT_EQ(&*Plan, InnerIfLinSucc->getPlan()); + EXPECT_EQ(&*Plan, InnerIf->getPlan()); + EXPECT_EQ(&*Plan, InnerLoopLatch->getPlan()); } // Test generation of Not and Or during predication. diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 67936a83efaf68..855016a1248836 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -86,5 +86,95 @@ TEST(VPInstructionTest, moveAfter) { EXPECT_EQ(I3->getParent(), I4->getParent()); } +TEST(VPBasicBlockTest, getPlan) { + { + VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPBasicBlock *VPBB2 = new VPBasicBlock(); + VPBasicBlock *VPBB3 = new VPBasicBlock(); + VPBasicBlock *VPBB4 = new VPBasicBlock(); + + // VPBB1 + // / \ + // VPBB2 VPBB3 + // \ / + // VPBB4 + VPBlockUtils::connectBlocks(VPBB1, VPBB2); + VPBlockUtils::connectBlocks(VPBB1, VPBB3); + VPBlockUtils::connectBlocks(VPBB2, VPBB4); + VPBlockUtils::connectBlocks(VPBB3, VPBB4); + + VPlan Plan; + Plan.setEntry(VPBB1); + + EXPECT_EQ(&Plan, VPBB1->getPlan()); + EXPECT_EQ(&Plan, VPBB2->getPlan()); + EXPECT_EQ(&Plan, VPBB3->getPlan()); + EXPECT_EQ(&Plan, VPBB4->getPlan()); + } + + { + // Region block is entry into VPlan. + VPBasicBlock *R1BB1 = new VPBasicBlock(); + VPBasicBlock *R1BB2 = new VPBasicBlock(); + VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1"); + VPBlockUtils::connectBlocks(R1BB1, R1BB2); + + VPlan Plan; + Plan.setEntry(R1); + EXPECT_EQ(&Plan, R1->getPlan()); + EXPECT_EQ(&Plan, R1BB1->getPlan()); + EXPECT_EQ(&Plan, R1BB2->getPlan()); + } + + { + // VPBasicBlock is the entry into the VPlan, followed by a region. 
+ VPBasicBlock *R1BB1 = new VPBasicBlock(); + VPBasicBlock *R1BB2 = new VPBasicBlock(); + VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1"); + VPBlockUtils::connectBlocks(R1BB1, R1BB2); + + VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPBlockUtils::connectBlocks(VPBB1, R1); + + VPlan Plan; + Plan.setEntry(VPBB1); + EXPECT_EQ(&Plan, VPBB1->getPlan()); + EXPECT_EQ(&Plan, R1->getPlan()); + EXPECT_EQ(&Plan, R1BB1->getPlan()); + EXPECT_EQ(&Plan, R1BB2->getPlan()); + } + + { + VPBasicBlock *R1BB1 = new VPBasicBlock(); + VPBasicBlock *R1BB2 = new VPBasicBlock(); + VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1"); + VPBlockUtils::connectBlocks(R1BB1, R1BB2); + + VPBasicBlock *R2BB1 = new VPBasicBlock(); + VPBasicBlock *R2BB2 = new VPBasicBlock(); + VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2"); + VPBlockUtils::connectBlocks(R2BB1, R2BB2); + + VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPBlockUtils::connectBlocks(VPBB1, R1); + VPBlockUtils::connectBlocks(VPBB1, R2); + + VPBasicBlock *VPBB2 = new VPBasicBlock(); + VPBlockUtils::connectBlocks(R1, VPBB2); + VPBlockUtils::connectBlocks(R2, VPBB2); + + VPlan Plan; + Plan.setEntry(VPBB1); + EXPECT_EQ(&Plan, VPBB1->getPlan()); + EXPECT_EQ(&Plan, R1->getPlan()); + EXPECT_EQ(&Plan, R1BB1->getPlan()); + EXPECT_EQ(&Plan, R1BB2->getPlan()); + EXPECT_EQ(&Plan, R2->getPlan()); + EXPECT_EQ(&Plan, R2BB1->getPlan()); + EXPECT_EQ(&Plan, R2BB2->getPlan()); + EXPECT_EQ(&Plan, VPBB2->getPlan()); + } +} + } // namespace } // namespace llvm diff --git a/llvm/utils/TableGen/CodeGenHwModes.h b/llvm/utils/TableGen/CodeGenHwModes.h index 1ff2faaa0e5245..55507cbca37ddd 100644 --- a/llvm/utils/TableGen/CodeGenHwModes.h +++ b/llvm/utils/TableGen/CodeGenHwModes.h @@ -12,6 +12,7 @@ #define LLVM_UTILS_TABLEGEN_CODEGENHWMODES_H #include "llvm/ADT/StringMap.h" +#include #include #include #include diff --git a/llvm/utils/TableGen/CodeGenInstruction.h b/llvm/utils/TableGen/CodeGenInstruction.h index 1f08ce481a8989..af851a11676b12 
100644 --- a/llvm/utils/TableGen/CodeGenInstruction.h +++ b/llvm/utils/TableGen/CodeGenInstruction.h @@ -16,6 +16,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/SMLoc.h" +#include #include #include #include diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn index ec2db1a9e2ded2..db789dcd880bf9 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn @@ -59,6 +59,7 @@ static_library("clangd") { "ClangdServer.cpp", "CodeComplete.cpp", "CodeCompletionStrings.cpp", + "CollectMacros.cpp", "CompileCommands.cpp", "Compiler.cpp", "Context.cpp", diff --git a/llvm/utils/gn/secondary/clang/test/BUILD.gn b/llvm/utils/gn/secondary/clang/test/BUILD.gn index 2b5ecb166c249f..874891e89c81a2 100644 --- a/llvm/utils/gn/secondary/clang/test/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/test/BUILD.gn @@ -75,9 +75,9 @@ write_lit_config("lit_site_cfg") { } if (llvm_enable_zlib) { - extra_values += [ "LLVM_ENABLE_ZLIB=1" ] + extra_values += [ "HAVE_LIBZ=1" ] } else { - extra_values += [ "LLVM_ENABLE_ZLIB=0" ] # Must be 0. + extra_values += [ "HAVE_LIBZ=0" ] # Must be 0. 
} if (host_cpu == "x64") { diff --git a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn index b0c690193a2e6f..c03399193babac 100644 --- a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn @@ -85,8 +85,8 @@ write_cmake_config("lit_common_configured") { } if (llvm_enable_zlib) { - values += [ "LLVM_ENABLE_ZLIB=1" ] + values += [ "HAVE_LIBZ=1" ] } else { - values += [ "LLVM_ENABLE_ZLIB=0" ] + values += [ "HAVE_LIBZ=0" ] } } diff --git a/llvm/utils/gn/secondary/lld/test/BUILD.gn b/llvm/utils/gn/secondary/lld/test/BUILD.gn index 1da191cba15170..5408ea8b6b24b3 100644 --- a/llvm/utils/gn/secondary/lld/test/BUILD.gn +++ b/llvm/utils/gn/secondary/lld/test/BUILD.gn @@ -49,9 +49,9 @@ write_lit_cfg("lit_site_cfg") { } if (llvm_enable_zlib) { - extra_values += [ "LLVM_ENABLE_ZLIB=1" ] + extra_values += [ "HAVE_LIBZ=1" ] } else { - extra_values += [ "LLVM_ENABLE_ZLIB=0" ] # Must be 0. + extra_values += [ "HAVE_LIBZ=0" ] # Must be 0. 
} if (current_cpu == "x64" || current_cpu == "arm64" || diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index 9f94540e4cff83..f8e1026475f5db 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -206,6 +206,7 @@ write_cmake_config("config") { "HAVE_ISATTY=", "HAVE_LIBPTHREAD=", "HAVE_PTHREAD_SETNAME_NP=", + "HAVE_LIBZ=", "HAVE_PREAD=", "HAVE_PTHREAD_GETSPECIFIC=", "HAVE_PTHREAD_H=", @@ -224,6 +225,7 @@ write_cmake_config("config") { "HAVE_SYS_TIME_H=", "HAVE_TERMIOS_H=", "HAVE_UNISTD_H=", + "HAVE_ZLIB_H=", "HAVE__CHSIZE_S=1", "HAVE__UNWIND_BACKTRACE=", "stricmp=_stricmp", @@ -242,6 +244,7 @@ write_cmake_config("config") { "HAVE_ISATTY=1", "HAVE_LIBPTHREAD=1", "HAVE_PTHREAD_SETNAME_NP=1", + "HAVE_LIBZ=1", "HAVE_PREAD=1", "HAVE_PTHREAD_GETSPECIFIC=1", "HAVE_PTHREAD_H=1", @@ -260,6 +263,7 @@ write_cmake_config("config") { "HAVE_SYS_TIME_H=1", "HAVE_TERMIOS_H=1", "HAVE_UNISTD_H=1", + "HAVE_ZLIB_H=1", "HAVE__CHSIZE_S=", "HAVE__UNWIND_BACKTRACE=1", "stricmp=", diff --git a/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn index 328d819fd1f057..82bf99de1bc9b9 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn @@ -68,6 +68,7 @@ static_library("Analysis") { "LoopAnalysisManager.cpp", "LoopCacheAnalysis.cpp", "LoopInfo.cpp", + "LoopNestAnalysis.cpp", "LoopPass.cpp", "LoopUnrollAnalyzer.cpp", "MemDepPrinter.cpp", diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn index f5e0b1222c826b..7aaaf867e11982 100644 --- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn @@ -166,9 +166,9 @@ write_lit_config("lit_site_cfg") { } if (llvm_enable_zlib) { - extra_values += [ "LLVM_ENABLE_ZLIB=1" ] + extra_values += 
[ "HAVE_LIBZ=1" ] } else { - extra_values += [ "LLVM_ENABLE_ZLIB=0" ] # Must be 0. + extra_values += [ "HAVE_LIBZ=0" ] # Must be 0. } } diff --git a/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn index 47bc50212651df..d73088885aa869 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn @@ -25,6 +25,7 @@ unittest("AnalysisTests") { "LazyCallGraphTest.cpp", "LoadsTest.cpp", "LoopInfoTest.cpp", + "LoopNestTest.cpp", "MemoryBuiltinsTest.cpp", "MemorySSATest.cpp", "OrderedInstructionsTest.cpp", diff --git a/mlir/include/mlir/IR/Matchers.h b/mlir/include/mlir/IR/Matchers.h index 6321e88c9c109e..d9979b8467ee0d 100644 --- a/mlir/include/mlir/IR/Matchers.h +++ b/mlir/include/mlir/IR/Matchers.h @@ -93,9 +93,8 @@ struct constant_int_op_binder { return false; auto type = op->getResult(0).getType(); - if (type.isSignlessIntOrIndex()) { + if (type.isa() || type.isa()) return attr_value_binder(bind_value).match(attr); - } if (type.isa() || type.isa()) { if (auto splatAttr = attr.dyn_cast()) { return attr_value_binder(bind_value) diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td index 25c0238946a938..d431d4ebabf4c0 100644 --- a/mlir/include/mlir/IR/OpBase.td +++ b/mlir/include/mlir/IR/OpBase.td @@ -339,6 +339,30 @@ def I16 : I<16>; def I32 : I<32>; def I64 : I<64>; +// Unsigned integer types. +// Any unsigned integer type irrespective of its width. +def AnyUnsignedInteger : Type< + CPred<"$_self.isUnsignedInteger()">, "unsigned integer">; + +// Unsigned integer type of a specific width. 
+class UI + : Type, + width # "-bit unsigned integer">, + BuildableType<"$_builder.getIntegerType(" # width # + ", /*isSigned=*/false)"> { + int bitwidth = width; +} + +class UnsignedIntOfWidths widths> : + AnyTypeOf), + StrJoinInt.result # "-bit unsigned integer">; + +def UI1 : UI<1>; +def UI8 : UI<8>; +def UI16 : UI<16>; +def UI32 : UI<32>; +def UI64 : UI<64>; + // Floating point types. // Any float type irrespective of its width. diff --git a/mlir/include/mlir/IR/StandardTypes.h b/mlir/include/mlir/IR/StandardTypes.h index 9bb9a8c06234d9..cd5ba07b689d7e 100644 --- a/mlir/include/mlir/IR/StandardTypes.h +++ b/mlir/include/mlir/IR/StandardTypes.h @@ -328,8 +328,9 @@ class TensorType : public ShapedType { // Note: Non standard/builtin types are allowed to exist within tensor // types. Dialects are expected to verify that tensor types have a valid // element type within that dialect. - return type.isSignlessIntOrFloat() || type.isa() || - type.isa() || type.isa() || + return type.isa() || type.isa() || + type.isa() || type.isa() || + type.isa() || (type.getKind() > Type::Kind::LAST_STANDARD_TYPE); } diff --git a/mlir/include/mlir/IR/Types.h b/mlir/include/mlir/IR/Types.h index 40f1d481876996..eccc90cdae0c61 100644 --- a/mlir/include/mlir/IR/Types.h +++ b/mlir/include/mlir/IR/Types.h @@ -169,6 +169,9 @@ class Type { /// Return true of this is a signless integer or a float type. bool isSignlessIntOrFloat(); + /// Return true of this is an integer(of any signedness) or a float type. + bool isIntOrFloat(); + /// Print the current type. 
void print(raw_ostream &os); void dump(); diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp index b76c0c0770a36f..14635a1447358f 100644 --- a/mlir/lib/Analysis/Utils.cpp +++ b/mlir/lib/Analysis/Utils.cpp @@ -314,7 +314,7 @@ static unsigned getMemRefEltSizeInBytes(MemRefType memRefType) { auto elementType = memRefType.getElementType(); unsigned sizeInBits; - if (elementType.isSignlessIntOrFloat()) { + if (elementType.isIntOrFloat()) { sizeInBits = elementType.getIntOrFloatBitWidth(); } else { auto vectorType = elementType.cast(); @@ -358,7 +358,7 @@ Optional mlir::getMemRefSizeInBytes(MemRefType memRefType) { if (!memRefType.hasStaticShape()) return None; auto elementType = memRefType.getElementType(); - if (!elementType.isSignlessIntOrFloat() && !elementType.isa()) + if (!elementType.isIntOrFloat() && !elementType.isa()) return None; uint64_t sizeInBytes = getMemRefEltSizeInBytes(memRefType); diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp index ed9400fc2ad06a..293d9351214718 100644 --- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp +++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp @@ -572,6 +572,7 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp, gpu::LaunchOp launchOp, BlockAndValueMapping &cloningMap, SmallVectorImpl &worklist, + DenseMap &bounds, PatternRewriter &rewriter) { // TODO(herhut): Verify that this is a valid GPU mapping. // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential @@ -631,22 +632,27 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp, // conditional. If the lower-bound is constant or defined before the // launch, we can use it in the launch bounds. Otherwise fail. if (!launchIndependent(lowerBound) && - !isa(lowerBound.getDefiningOp())) + !isa_and_nonnull(lowerBound.getDefiningOp())) return failure(); // The step must also be constant or defined outside of the loop nest. 
- if (!launchIndependent(step) && !isa(step.getDefiningOp())) + if (!launchIndependent(step) && + !isa_and_nonnull(step.getDefiningOp())) return failure(); // If the upper-bound is constant or defined before the launch, we can // use it in the launch bounds directly. Otherwise try derive a bound. - bool boundIsPrecise = launchIndependent(upperBound) || - isa(upperBound.getDefiningOp()); + bool boundIsPrecise = + launchIndependent(upperBound) || + isa_and_nonnull(upperBound.getDefiningOp()); { PatternRewriter::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(launchOp); if (!boundIsPrecise) { upperBound = deriveStaticUpperBound(upperBound, rewriter); - if (!upperBound) - return failure(); + if (!upperBound) { + return parallelOp.emitOpError() + << "cannot derive loop-invariant upper bound for number " + "of iterations"; + } } // Compute the number of iterations needed. We compute this as an // affine expression ceilDiv (upperBound - lowerBound) step. We use @@ -654,8 +660,8 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp, AffineMap stepMap = AffineMap::get(0, 3, ((rewriter.getAffineSymbolExpr(0) - - rewriter.getAffineSymbolExpr(1)).ceilDiv( - rewriter.getAffineSymbolExpr(2)))); + rewriter.getAffineSymbolExpr(1)) + .ceilDiv(rewriter.getAffineSymbolExpr(2)))); Value launchBound = rewriter.create( loc, annotation.boundMap.compose(stepMap), ValueRange{ @@ -664,7 +670,12 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp, ensureLaunchIndependent( cloningMap.lookupOrDefault(lowerBound)), ensureLaunchIndependent(cloningMap.lookupOrDefault(step))}); - launchOp.setOperand(annotation.processor, launchBound); + if (bounds.find(annotation.processor) != bounds.end()) { + return parallelOp.emitOpError() + << "cannot redefine the bound for processor " + << annotation.processor; + } + bounds[annotation.processor] = launchBound; } if (!boundIsPrecise) { // We are using an approximation, create a surrounding conditional. 
@@ -746,9 +757,10 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, rewriter.setInsertionPointToStart(&launchOp.body().front()); BlockAndValueMapping cloningMap; + llvm::DenseMap launchBounds; SmallVector worklist; if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist, - rewriter))) + launchBounds, rewriter))) return matchFailure(); // Whether we have seen any side-effects. Reset when leaving an inner scope. @@ -770,8 +782,9 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, // A nested loop.parallel needs insertion of code to compute indices. // Insert that now. This will also update the worklist with the loops // body. - processParallelLoop(nestedParallel, launchOp, cloningMap, worklist, - rewriter); + if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap, + worklist, launchBounds, rewriter))) + return matchFailure(); } else if (op == launchOp.getOperation()) { // Found our sentinel value. We have finished the operations from one // nesting level, pop one level back up. @@ -791,6 +804,11 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, } } + // Now that we succeeded creating the launch operation, also update the + // bounds. + for (auto bound : launchBounds) + launchOp.setOperand(std::get<0>(bound), std::get<1>(bound)); + rewriter.eraseOp(parallelOp); return matchSuccess(); } diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index 140f533e0b1558..ac2648846b2436 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -1372,17 +1372,18 @@ void ModulePrinter::printAttribute(Attribute attr, /// Print the integer element of the given DenseElementsAttr at 'index'. static void printDenseIntElement(DenseElementsAttr attr, raw_ostream &os, - unsigned index) { + unsigned index, bool isSigned) { APInt value = *std::next(attr.int_value_begin(), index); if (value.getBitWidth() == 1) os << (value.getBoolValue() ? 
"true" : "false"); else - value.print(os, /*isSigned=*/true); + value.print(os, isSigned); } /// Print the float element of the given DenseElementsAttr at 'index'. static void printDenseFloatElement(DenseElementsAttr attr, raw_ostream &os, - unsigned index) { + unsigned index, bool isSigned) { + assert(isSigned && "floating point values are always signed"); APFloat value = *std::next(attr.float_value_begin(), index); printFloatValue(value, os); } @@ -1392,6 +1393,7 @@ void ModulePrinter::printDenseElementsAttr(DenseElementsAttr attr, auto type = attr.getType(); auto shape = type.getShape(); auto rank = type.getRank(); + bool isSigned = !type.getElementType().isUnsignedInteger(); // The function used to print elements of this attribute. auto printEltFn = type.getElementType().isa() @@ -1400,7 +1402,7 @@ void ModulePrinter::printDenseElementsAttr(DenseElementsAttr attr, // Special case for 0-d and splat tensors. if (attr.isSplat()) { - printEltFn(attr, os, 0); + printEltFn(attr, os, 0, isSigned); return; } @@ -1452,7 +1454,7 @@ void ModulePrinter::printDenseElementsAttr(DenseElementsAttr attr, while (openBrackets++ < rank) os << '['; openBrackets = rank; - printEltFn(attr, os, idx); + printEltFn(attr, os, idx, isSigned); bumpCounter(); } while (openBrackets-- > 0) diff --git a/mlir/lib/IR/Attributes.cpp b/mlir/lib/IR/Attributes.cpp index 5beb12a59940bc..4526d7dc10be79 100644 --- a/mlir/lib/IR/Attributes.cpp +++ b/mlir/lib/IR/Attributes.cpp @@ -608,7 +608,7 @@ DenseElementsAttr::FloatElementIterator::FloatElementIterator( DenseElementsAttr DenseElementsAttr::get(ShapedType type, ArrayRef values) { - assert(type.getElementType().isSignlessIntOrFloat() && + assert(type.getElementType().isIntOrFloat() && "expected int or float element type"); assert(hasSameElementsOrSplat(type, values)); diff --git a/mlir/lib/IR/StandardTypes.cpp b/mlir/lib/IR/StandardTypes.cpp index 30d5bbcc7b3ced..774f80a46de3a8 100644 --- a/mlir/lib/IR/StandardTypes.cpp +++ 
b/mlir/lib/IR/StandardTypes.cpp @@ -84,6 +84,8 @@ bool Type::isSignlessIntOrFloat() { return isSignlessInteger() || isa(); } +bool Type::isIntOrFloat() { return isa() || isa(); } + //===----------------------------------------------------------------------===// // Integer Type //===----------------------------------------------------------------------===// @@ -147,13 +149,10 @@ const llvm::fltSemantics &FloatType::getFloatSemantics() { } unsigned Type::getIntOrFloatBitWidth() { - assert(isSignlessIntOrFloat() && "only ints and floats have a bitwidth"); - if (auto intType = dyn_cast()) { + assert(isIntOrFloat() && "only integers and floats have a bitwidth"); + if (auto intType = dyn_cast()) return intType.getWidth(); - } - - auto floatType = cast(); - return floatType.getWidth(); + return cast().getWidth(); } //===----------------------------------------------------------------------===// @@ -202,7 +201,7 @@ int64_t ShapedType::getSizeInBits() const { "cannot get the bit size of an aggregate with a dynamic shape"); auto elementType = getElementType(); - if (elementType.isSignlessIntOrFloat()) + if (elementType.isIntOrFloat()) return elementType.getIntOrFloatBitWidth() * getNumElements(); // Tensors can have vectors and other tensors as elements, other shaped types @@ -373,7 +372,7 @@ MemRefType MemRefType::getImpl(ArrayRef shape, Type elementType, auto *context = elementType.getContext(); // Check that memref is formed from allowed types. - if (!elementType.isSignlessIntOrFloat() && !elementType.isa() && + if (!elementType.isIntOrFloat() && !elementType.isa() && !elementType.isa()) return emitOptionalError(location, "invalid memref element type"), MemRefType(); @@ -451,7 +450,7 @@ LogicalResult UnrankedMemRefType::verifyConstructionInvariants(Location loc, Type elementType, unsigned memorySpace) { // Check that memref is formed from allowed types. 
- if (!elementType.isSignlessIntOrFloat() && !elementType.isa() && + if (!elementType.isIntOrFloat() && !elementType.isa() && !elementType.isa()) return emitError(loc, "invalid memref element type"); return success(); diff --git a/mlir/lib/Parser/Parser.cpp b/mlir/lib/Parser/Parser.cpp index 668fb694d8fd52..661bddf8107a0d 100644 --- a/mlir/lib/Parser/Parser.cpp +++ b/mlir/lib/Parser/Parser.cpp @@ -1102,7 +1102,7 @@ Type Parser::parseMemRefType() { return nullptr; // Check that memref is formed from allowed types. - if (!elementType.isSignlessIntOrFloat() && !elementType.isa() && + if (!elementType.isIntOrFloat() && !elementType.isa() && !elementType.isa()) return emitError(typeLoc, "invalid memref element type"), nullptr; diff --git a/mlir/lib/Transforms/DialectConversion.cpp b/mlir/lib/Transforms/DialectConversion.cpp index 8e1a9cc942bdcf..ed81b588875ca8 100644 --- a/mlir/lib/Transforms/DialectConversion.cpp +++ b/mlir/lib/Transforms/DialectConversion.cpp @@ -51,9 +51,11 @@ computeConversionSet(iterator_range region, : Optional(); if (legalityInfo && legalityInfo->isRecursivelyLegal) continue; - for (auto ®ion : op.getRegions()) - computeConversionSet(region.getBlocks(), region.getLoc(), toConvert, - target); + for (auto ®ion : op.getRegions()) { + if (failed(computeConversionSet(region.getBlocks(), region.getLoc(), + toConvert, target))) + return failure(); + } } // Recurse to children that haven't been visited. 
diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Transforms/LoopFusion.cpp index ef1af5d71aa840..bcb0c16ba77ba3 100644 --- a/mlir/lib/Transforms/LoopFusion.cpp +++ b/mlir/lib/Transforms/LoopFusion.cpp @@ -869,7 +869,7 @@ static unsigned getMemRefEltSizeInBytes(MemRefType memRefType) { auto elementType = memRefType.getElementType(); unsigned sizeInBits; - if (elementType.isSignlessIntOrFloat()) { + if (elementType.isIntOrFloat()) { sizeInBits = elementType.getIntOrFloatBitWidth(); } else { auto vectorType = elementType.cast(); diff --git a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir index 2a440a4456ba72..24ea0320f0ac36 100644 --- a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir +++ b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file %s | FileCheck %s -dump-input-on-failure +// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file -verify-diagnostics %s | FileCheck %s -dump-input-on-failure // 2-d parallel loop mapped to block.y and block.x @@ -299,3 +299,55 @@ module { // CHECK: return // CHECK: } // CHECK: } + +// ----- + +// Mapping to the same processor twice. + +func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, + %buf : memref, + %res : memref) { + %four = constant 4 : index + // expected-error@+2 {{cannot redefine the bound for processor 1}} + // expected-error@+1 {{failed to legalize operation 'loop.parallel'}} + loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%four, %four) { + } { mapping = [ + {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, + {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>} + ] } + return +} + +// ----- + +// Loop with loop-variant upper bound. 
+ +func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, + %buf : memref, + %res : memref) { + %zero = constant 0 : index + %one = constant 1 : index + %four = constant 4 : index + // expected-error@+1 {{failed to legalize operation 'loop.parallel'}} + loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%four, %four) { + // expected-error@+1 {{cannot derive loop-invariant upper bound}} + loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1) + step (%one, %one) { + %idx0 = addi %i0, %si0 : index + %idx1 = addi %i1, %si1 : index + %val = load %buf[%idx0, %idx1] : memref + store %val, %res[%idx1, %idx0] : memref + } { mapping = [ + {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, + {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>} + ] } + } { mapping = [ + {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, + {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>} + ] } + return +} diff --git a/mlir/test/IR/parser.mlir b/mlir/test/IR/parser.mlir index bec1fbd4aca6fe..3baf0642e8b05c 100644 --- a/mlir/test/IR/parser.mlir +++ b/mlir/test/IR/parser.mlir @@ -616,6 +616,9 @@ func @splattensorattr() -> () { // CHECK: "splatBoolTensor"() {bar = dense : tensor} : () -> () "splatBoolTensor"(){bar = dense : tensor} : () -> () + // CHECK: "splatUIntTensor"() {bar = dense<222> : tensor<2x1x4xui8>} : () -> () + "splatUIntTensor"(){bar = dense<222> : tensor<2x1x4xui8>} : () -> () + // CHECK: "splatIntTensor"() {bar = dense<5> : tensor<2x1x4xi32>} : () -> () "splatIntTensor"(){bar = dense<5> : tensor<2x1x4xi32>} : () -> () diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp index 3e1b52718e2ee4..3b9efd6ecbdf4f 100644 --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -37,7 +37,7 @@ std::mutex *TrlTblMtx; HostPtrToTableMapTy *HostPtrToTableMap; 
std::mutex *TblMapMtx; -__attribute__((constructor(0))) void init() { +__attribute__((constructor(101))) void init() { DP("Init target library!\n"); RTLs = new RTLsTy(); RTLsMtx = new std::mutex(); @@ -47,7 +47,7 @@ __attribute__((constructor(0))) void init() { TblMapMtx = new std::mutex(); } -__attribute__((destructor(0))) void deinit() { +__attribute__((destructor(101))) void deinit() { DP("Deinit target library!\n"); delete RTLs; delete RTLsMtx;