From 0860db966a7d2ab61b26e41426a55189986924b4 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 19 Dec 2019 10:01:10 -0500 Subject: [PATCH 1/5] [OPENMP50]Codegen for nontemporal clause. Summary: Basic codegen for the declarations marked as nontemporal. Also, if the base declaration in the member expression is marked as nontemporal, lvalue for member decl access inherits nonteporal flag from the base lvalue. Reviewers: rjmccall, hfinkel, jdoerfert Subscribers: guansong, arphaman, caomhin, kkwli0, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D71708 --- clang/include/clang/AST/OpenMPClause.h | 23 +++ clang/include/clang/AST/RecursiveASTVisitor.h | 3 + clang/lib/AST/OpenMPClause.cpp | 12 +- clang/lib/AST/StmtProfile.cpp | 7 +- clang/lib/CodeGen/CGExpr.cpp | 33 +++- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 41 +++++ clang/lib/CodeGen/CGOpenMPRuntime.h | 19 +++ clang/lib/CodeGen/CGStmtOpenMP.cpp | 5 +- clang/lib/Sema/SemaOpenMP.cpp | 46 ++++-- clang/lib/Serialization/ASTReader.cpp | 5 + clang/lib/Serialization/ASTWriter.cpp | 2 + clang/test/OpenMP/distribute_simd_codegen.cpp | 12 +- clang/test/OpenMP/for_simd_codegen.cpp | 5 +- clang/test/OpenMP/simd_codegen.cpp | 59 ++++++- .../target_parallel_for_simd_codegen.cpp | 5 +- clang/test/OpenMP/target_simd_codegen.cpp | 150 +++++++++--------- ...istribute_parallel_for_simd_if_codegen.cpp | 13 +- .../target_teams_distribute_simd_codegen.cpp | 4 +- .../OpenMP/teams_distribute_simd_codegen.cpp | 8 +- clang/tools/libclang/CIndex.cpp | 2 + 20 files changed, 339 insertions(+), 115 deletions(-) diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index c708b011849a31..7cef0bc9092a18 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -6275,6 +6275,15 @@ class OMPNontemporalClause final OMPC_nontemporal, SourceLocation(), SourceLocation(), SourceLocation(), N) {} + /// Get the list of privatied copies if the member expression was captured by + /// one of the privatization clauses. + MutableArrayRef getPrivateRefs() { + return MutableArrayRef(varlist_end(), varlist_size()); + } + ArrayRef getPrivateRefs() const { + return llvm::makeArrayRef(varlist_end(), varlist_size()); + } + public: /// Creates clause with a list of variables \a VL. /// @@ -6293,6 +6302,10 @@ class OMPNontemporalClause final /// \param N The number of variables. static OMPNontemporalClause *CreateEmpty(const ASTContext &C, unsigned N); + /// Sets the list of references to private copies created in private clauses. + /// \param VL List of references. 
+ void setPrivateRefs(ArrayRef VL); + child_range children() { return child_range(reinterpret_cast(varlist_begin()), reinterpret_cast(varlist_end())); @@ -6303,6 +6316,16 @@ class OMPNontemporalClause final return const_child_range(Children.begin(), Children.end()); } + child_range private_refs() { + return child_range(reinterpret_cast(getPrivateRefs().begin()), + reinterpret_cast(getPrivateRefs().end())); + } + + const_child_range private_refs() const { + auto Children = const_cast(this)->private_refs(); + return const_child_range(Children.begin(), Children.end()); + } + child_range used_children() { return child_range(child_iterator(), child_iterator()); } diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index a93a01da34c160..19dd62b0fe0fd7 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -3378,6 +3378,9 @@ template bool RecursiveASTVisitor::VisitOMPNontemporalClause( OMPNontemporalClause *C) { TRY_TO(VisitOMPClauseList(C)); + for (auto *E : C->private_refs()) { + TRY_TO(TraverseStmt(E)); + } return true; } diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index f062c3e463a875..790b440122980f 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -1162,8 +1162,8 @@ OMPNontemporalClause *OMPNontemporalClause::Create(const ASTContext &C, SourceLocation LParenLoc, SourceLocation EndLoc, ArrayRef VL) { - // Allocate space for nontemporal variables. - void *Mem = C.Allocate(totalSizeToAlloc(VL.size())); + // Allocate space for nontemporal variables + private references. + void *Mem = C.Allocate(totalSizeToAlloc(2 * VL.size())); auto *Clause = new (Mem) OMPNontemporalClause(StartLoc, LParenLoc, EndLoc, VL.size()); Clause->setVarRefs(VL); @@ -1172,10 +1172,16 @@ OMPNontemporalClause *OMPNontemporalClause::Create(const ASTContext &C, OMPNontemporalClause *OMPNontemporalClause::CreateEmpty(const ASTContext &C, unsigned N) { - void *Mem = C.Allocate(totalSizeToAlloc(N)); + void *Mem = C.Allocate(totalSizeToAlloc(2 * N)); return new (Mem) OMPNontemporalClause(N); } +void OMPNontemporalClause::setPrivateRefs(ArrayRef VL) { + assert(VL.size() == varlist_size() && "Number of private references is not " + "the same as the preallocated buffer"); + std::copy(VL.begin(), VL.end(), varlist_end()); +} + //===----------------------------------------------------------------------===// // OpenMP clauses printing methods //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 56a65e8deb9886..2aa5106e90faed 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -769,10 +769,13 @@ void OMPClauseProfiler::VisitOMPIsDevicePtrClause( const OMPIsDevicePtrClause *C) { VisitOMPClauseList(C); } -void OMPClauseProfiler::VisitOMPNontemporalClause(const OMPNontemporalClause *C) { +void OMPClauseProfiler::VisitOMPNontemporalClause( + const OMPNontemporalClause *C) { VisitOMPClauseList(C); + for (auto *E : C->private_refs()) + Profiler->VisitStmt(E); } -} +} // namespace void StmtProfiler::VisitOMPExecutableDirective(const OMPExecutableDirective *S) { diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index cd42faefab00b4..c729037852e2ab 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -2566,21 +2566,35 @@ LValue CodeGenFunction::EmitDeclRefLValue(const DeclRefExpr 
*E) { VD = VD->getCanonicalDecl(); if (auto *FD = LambdaCaptureFields.lookup(VD)) return EmitCapturedFieldLValue(*this, FD, CXXABIThisValue); - else if (CapturedStmtInfo) { + if (CapturedStmtInfo) { auto I = LocalDeclMap.find(VD); if (I != LocalDeclMap.end()) { + LValue CapLVal; if (VD->getType()->isReferenceType()) - return EmitLoadOfReferenceLValue(I->second, VD->getType(), - AlignmentSource::Decl); - return MakeAddrLValue(I->second, T); + CapLVal = EmitLoadOfReferenceLValue(I->second, VD->getType(), + AlignmentSource::Decl); + else + CapLVal = MakeAddrLValue(I->second, T); + // Mark lvalue as nontemporal if the variable is marked as nontemporal + // in simd context. + if (getLangOpts().OpenMP && + CGM.getOpenMPRuntime().isNontemporalDecl(VD)) + CapLVal.setNontemporal(/*Value=*/true); + return CapLVal; } LValue CapLVal = EmitCapturedFieldLValue(*this, CapturedStmtInfo->lookup(VD), CapturedStmtInfo->getContextValue()); - return MakeAddrLValue( + CapLVal = MakeAddrLValue( Address(CapLVal.getPointer(*this), getContext().getDeclAlign(VD)), CapLVal.getType(), LValueBaseInfo(AlignmentSource::Decl), CapLVal.getTBAAInfo()); + // Mark lvalue as nontemporal if the variable is marked as nontemporal + // in simd context. + if (getLangOpts().OpenMP && + CGM.getOpenMPRuntime().isNontemporalDecl(VD)) + CapLVal.setNontemporal(/*Value=*/true); + return CapLVal; } assert(isa(CurCodeDecl)); @@ -3929,6 +3943,15 @@ LValue CodeGenFunction::EmitMemberExpr(const MemberExpr *E) { if (auto *Field = dyn_cast(ND)) { LValue LV = EmitLValueForField(BaseLV, Field); setObjCGCLValueClass(getContext(), E, LV); + if (getLangOpts().OpenMP) { + // If the member was explicitly marked as nontemporal, mark it as + // nontemporal. If the base lvalue is marked as nontemporal, mark access + // to children as nontemporal too. 
+ if ((IsWrappedCXXThis(BaseExpr) && + CGM.getOpenMPRuntime().isNontemporalDecl(Field)) || + BaseLV.isNontemporal()) + LV.setNontemporal(/*Value=*/true); + } return LV; } diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 1aae18b4504c8f..2a53e9993533c1 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -17,6 +17,7 @@ #include "CodeGenFunction.h" #include "clang/AST/Attr.h" #include "clang/AST/Decl.h" +#include "clang/AST/OpenMPClause.h" #include "clang/AST/StmtOpenMP.h" #include "clang/Basic/BitmaskEnum.h" #include "clang/CodeGen/ConstantInitBuilder.h" @@ -11341,6 +11342,46 @@ bool CGOpenMPRuntime::emitDeclareVariant(GlobalDecl GD, bool IsForDefinition) { return true; } +CGOpenMPRuntime::NontemporalDeclsRAII::NontemporalDeclsRAII( + CodeGenModule &CGM, const OMPLoopDirective &S) + : CGM(CGM), NeedToPush(S.hasClausesOfKind()) { + assert(CGM.getLangOpts().OpenMP && "Not in OpenMP mode."); + if (!NeedToPush) + return; + NontemporalDeclsSet &DS = + CGM.getOpenMPRuntime().NontemporalDeclsStack.emplace_back(); + for (const auto *C : S.getClausesOfKind()) { + for (const Stmt *Ref : C->private_refs()) { + const auto *SimpleRefExpr = cast(Ref)->IgnoreParenImpCasts(); + const ValueDecl *VD; + if (const auto *DRE = dyn_cast(SimpleRefExpr)) { + VD = DRE->getDecl(); + } else { + const auto *ME = cast(SimpleRefExpr); + assert((ME->isImplicitCXXThis() || + isa(ME->getBase()->IgnoreParenImpCasts())) && + "Expected member of current class."); + VD = ME->getMemberDecl(); + } + DS.insert(VD); + } + } +} + +CGOpenMPRuntime::NontemporalDeclsRAII::~NontemporalDeclsRAII() { + if (!NeedToPush) + return; + CGM.getOpenMPRuntime().NontemporalDeclsStack.pop_back(); +} + +bool CGOpenMPRuntime::isNontemporalDecl(const ValueDecl *VD) const { + assert(CGM.getLangOpts().OpenMP && "Not in OpenMP mode."); + + return llvm::any_of( + CGM.getOpenMPRuntime().NontemporalDeclsStack, + [VD](const NontemporalDeclsSet &Set) { return Set.count(VD) > 0; }); +} + llvm::Function *CGOpenMPSIMDRuntime::emitParallelOutlinedFunction( const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index d806c27fdcba9d..f1582457ed2daa 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -211,6 +211,16 @@ class CGOpenMPRuntime { ~DisableAutoDeclareTargetRAII(); }; + /// Manages list of nontemporal decls for the specified directive. + class NontemporalDeclsRAII { + CodeGenModule &CGM; + const bool NeedToPush; + + public: + NontemporalDeclsRAII(CodeGenModule &CGM, const OMPLoopDirective &S); + ~NontemporalDeclsRAII(); + }; + protected: CodeGenModule &CGM; StringRef FirstSeparator, Separator; @@ -650,6 +660,11 @@ class CGOpenMPRuntime { std::pair> DeferredVariantFunction; + using NontemporalDeclsSet = llvm::SmallDenseSet>; + /// Stack for list of declarations in current context marked as nontemporal. + /// The set is the union of all current stack elements. + llvm::SmallVector NontemporalDeclsStack; + /// Flag for keeping track of weather a requires unified_shared_memory /// directive is present. bool HasRequiresUnifiedSharedMemory = false; @@ -1663,6 +1678,10 @@ class CGOpenMPRuntime { /// Emits the definition of the declare variant function. 
virtual bool emitDeclareVariant(GlobalDecl GD, bool IsForDefinition); + + /// Checks if the \p VD variable is marked as nontemporal declaration in + /// current context. + bool isNontemporalDecl(const ValueDecl *VD) const; }; /// Class supports emissionof SIMD-only code. diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index e8a3790e3b5c62..a2a59b64ddb327 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -1803,8 +1803,9 @@ static LValue EmitOMPHelperVar(CodeGenFunction &CGF, static void emitCommonSimdLoop(CodeGenFunction &CGF, const OMPLoopDirective &S, const RegionCodeGenTy &SimdInitGen, const RegionCodeGenTy &BodyCodeGen) { - auto &&ThenGen = [&SimdInitGen, &BodyCodeGen](CodeGenFunction &CGF, - PrePostActionTy &) { + auto &&ThenGen = [&S, &SimdInitGen, &BodyCodeGen](CodeGenFunction &CGF, + PrePostActionTy &) { + CGOpenMPRuntime::NontemporalDeclsRAII NontemporalsRegion(CGF.CGM, S); CodeGenFunction::OMPLocalDeclMapRAII Scope(CGF); SimdInitGen(CGF); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 915bf53d06e30f..7365fda62beee3 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -2230,6 +2230,11 @@ void Sema::EndOpenMPClause() { static void checkAllocateClauses(Sema &S, DSAStackTy *Stack, ArrayRef Clauses); +static std::pair +getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, + SourceRange &ERange, bool AllowArraySection = false); +static DeclRefExpr *buildCapture(Sema &S, ValueDecl *D, Expr *CaptureExpr, + bool WithInit); void Sema::EndOpenMPDSABlock(Stmt *CurDirective) { // OpenMP [2.14.3.5, Restrictions, C/C++, p.1] @@ -2274,6 +2279,31 @@ void Sema::EndOpenMPDSABlock(Stmt *CurDirective) { } } Clause->setPrivateCopies(PrivateCopies); + continue; + } + // Finalize nontemporal clause by handling private copies, if any. + if (auto *Clause = dyn_cast(C)) { + SmallVector PrivateRefs; + for (Expr *RefExpr : Clause->varlists()) { + assert(RefExpr && "NULL expr in OpenMP nontemporal clause."); + SourceLocation ELoc; + SourceRange ERange; + Expr *SimpleRefExpr = RefExpr; + auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + if (Res.second) + // It will be analyzed later. + PrivateRefs.push_back(RefExpr); + ValueDecl *D = Res.first; + if (!D) + continue; + + const DSAStackTy::DSAVarData DVar = + DSAStack->getTopDSA(D, /*FromParent=*/false); + PrivateRefs.push_back(DVar.PrivateCopy ? DVar.PrivateCopy + : SimpleRefExpr); + } + Clause->setPrivateRefs(PrivateRefs); + continue; } } // Check allocate clauses. @@ -4262,9 +4292,10 @@ static bool checkIfClauses(Sema &S, OpenMPDirectiveKind Kind, return ErrorFound; } -static std::pair -getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, - SourceRange &ERange, bool AllowArraySection = false) { +static std::pair getPrivateItem(Sema &S, Expr *&RefExpr, + SourceLocation &ELoc, + SourceRange &ERange, + bool AllowArraySection) { if (RefExpr->isTypeDependent() || RefExpr->isValueDependent() || RefExpr->containsUnexpandedParameterPack()) return std::make_pair(nullptr, true); @@ -17172,8 +17203,6 @@ OMPClause *Sema::ActOnOpenMPNontemporalClause(ArrayRef VarList, if (!D) continue; - auto *VD = dyn_cast(D); - // OpenMP 5.0, 2.9.3.1 simd Construct, Restrictions. // A list-item cannot appear in more than one nontemporal clause. 
if (const Expr *PrevRef = @@ -17185,12 +17214,7 @@ OMPClause *Sema::ActOnOpenMPNontemporalClause(ArrayRef VarList, continue; } - DeclRefExpr *Ref = nullptr; - if (!VD && isOpenMPCapturedDecl(D) && !CurContext->isDependentContext()) - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true); - Vars.push_back((VD || !Ref || CurContext->isDependentContext()) - ? RefExpr->IgnoreParens() - : Ref); + Vars.push_back(RefExpr); } if (Vars.empty()) diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 9258a6fb1265a1..032ce888969e20 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -12466,4 +12466,9 @@ void OMPClauseReader::VisitOMPNontemporalClause(OMPNontemporalClause *C) { for (unsigned i = 0; i != NumVars; ++i) Vars.push_back(Record.readSubExpr()); C->setVarRefs(Vars); + Vars.clear(); + Vars.reserve(NumVars); + for (unsigned i = 0; i != NumVars; ++i) + Vars.push_back(Record.readSubExpr()); + C->setPrivateRefs(Vars); } diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index a632072d01bd78..2e2ce0e875d000 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6544,4 +6544,6 @@ void OMPClauseWriter::VisitOMPNontemporalClause(OMPNontemporalClause *C) { Record.AddSourceLocation(C->getLParenLoc()); for (auto *VE : C->varlists()) Record.AddStmt(VE); + for (auto *E : C->private_refs()) + Record.AddStmt(E); } diff --git a/clang/test/OpenMP/distribute_simd_codegen.cpp b/clang/test/OpenMP/distribute_simd_codegen.cpp index f0128a977d68f8..104b1ba91b664c 100644 --- a/clang/test/OpenMP/distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/distribute_simd_codegen.cpp @@ -143,7 +143,7 @@ void static_not_chunked(float *a, float *b, float *c, float *d) { #pragma omp target #pragma omp teams #ifdef OMP5 - #pragma omp distribute simd dist_schedule(static) safelen(32) if(simd: true) + #pragma omp distribute simd dist_schedule(static) safelen(32) if(simd: true) nontemporal(a, b) #else #pragma omp distribute simd dist_schedule(static) safelen(32) #endif // OMP5 @@ -189,6 +189,11 @@ void static_not_chunked(float *a, float *b, float *c, float *d) { // CHECK: [[BBINNBODY]]: // CHECK: {{.+}} = load i32, i32* [[IV]] // ... loop body ... 
+// OMP45-NOT: !nontemporal +// OMP50: load float*,{{.*}}!nontemporal +// OMP50: load float*,{{.*}}!nontemporal +// OMP50-NOT: !nontemporal + // CHECK: br label %[[BBBODYCONT:.+]] // CHECK: [[BBBODYCONT]]: // CHECK: br label %[[BBINNINC:.+]] @@ -271,7 +276,7 @@ void test_precond() { #pragma omp target #pragma omp teams #ifdef OMP5 - #pragma omp distribute simd linear(i) if(a) + #pragma omp distribute simd linear(i) if(a) nontemporal(i) #else #pragma omp distribute simd linear(i) #endif // OMP5 @@ -293,6 +298,9 @@ void test_precond() { // CHECK: br i1 [[ACMP]], label %[[PRECOND_THEN:.+]], label %[[PRECOND_END:.+]] // CHECK: [[PRECOND_THEN]] // CHECK: call void @__kmpc_for_static_init_4 +// OMP45-NOT: !nontemporal +// OMP50: store i8 {{.*}}!nontemporal +// OMP50-NOT: !nontemporal // CHECK: call void @__kmpc_for_static_fini // CHECK: [[PRECOND_END]] diff --git a/clang/test/OpenMP/for_simd_codegen.cpp b/clang/test/OpenMP/for_simd_codegen.cpp index 4bc6b362f0278e..a668cb77a06839 100644 --- a/clang/test/OpenMP/for_simd_codegen.cpp +++ b/clang/test/OpenMP/for_simd_codegen.cpp @@ -333,7 +333,7 @@ void simple(float *a, float *b, float *c, float *d) { // OMP50: br i1 [[COND]], label {{%?}}[[THEN:[^,]+]], label {{%?}}[[ELSE:[^,]+]] // OMP50: [[THEN]]: #ifdef OMP5 - #pragma omp for simd reduction(*:R) if (simd:A) + #pragma omp for simd reduction(*:R) if (simd:A) nontemporal(R) #else #pragma omp for simd reduction(*:R) #endif @@ -366,7 +366,8 @@ void simple(float *a, float *b, float *c, float *d) { // CHECK-NEXT: [[LC_IT_2:%.+]] = add nsw i64 -10, [[LC_IT_1]] // CHECK-NEXT: store i64 [[LC_IT_2]], i64* [[LC:%[^,]+]], // CHECK-NEXT: [[LC_VAL:%.+]] = load i64, i64* [[LC]] -// CHECK: store i32 %{{.+}}, i32* [[R_PRIV]], +// OMP45: store i32 %{{.+}}, i32* [[R_PRIV]], +// OMP50: store i32 %{{.+}}, i32* [[R_PRIV]],{{.*}}!nontemporal R *= i; // CHECK: [[IV8_2:%.+]] = load i64, i64* [[OMP_IV8]] // CHECK-NEXT: [[ADD8_2:%.+]] = add nsw i64 [[IV8_2]], 1 diff --git a/clang/test/OpenMP/simd_codegen.cpp b/clang/test/OpenMP/simd_codegen.cpp index 19ed323af42662..aac76ad127dc8f 100644 --- a/clang/test/OpenMP/simd_codegen.cpp +++ b/clang/test/OpenMP/simd_codegen.cpp @@ -22,10 +22,15 @@ long long get_val() { return 0; } double *g_ptr; +struct S { + int a, b; +}; + // CHECK-LABEL: define {{.*void}} @{{.*}}simple{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}}) void simple(float *a, float *b, float *c, float *d) { + S s, *p; #ifdef OMP5 - #pragma omp simd if (simd: true) + #pragma omp simd if (simd: true) nontemporal(a, b, c, d, s) #else #pragma omp simd #endif @@ -43,8 +48,17 @@ void simple(float *a, float *b, float *c, float *d) { // CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]{{.*}}!llvm.access.group // ... loop body ... 
// End of body: store into a[i]: +// OMP45-NOT: load float*,{{.*}}!nontemporal +// CHECK-NOT: load float,{{.*}}!nontemporal +// OMP50: load float*,{{.*}}!nontemporal +// OMP50: load float*,{{.*}}!nontemporal +// OMP50: load float*,{{.*}}!nontemporal +// OMP50: load i32,{{.*}}!nontemporal +// OMP50-NOT: load i32,{{.*}}!nontemporal +// OMP50: load float*,{{.*}}!nontemporal +// CHECK-NOT: load float,{{.*}}!nontemporal // CHECK: store float [[RESULT:%.+]], float* {{%.+}}{{.*}}!llvm.access.group - a[i] = b[i] * c[i] * d[i]; + a[i] = b[i] * c[i] * d[i] + s.a + p->a; // CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}!llvm.access.group // CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1 // CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]{{.*}}!llvm.access.group @@ -718,6 +732,47 @@ void linear(float *a) { // } +#ifdef OMP5 +// OMP50-LABEL: inner_simd +void inner_simd() { + double a, b; +#pragma omp simd nontemporal(a) + for (int i = 0; i < 10; ++i) { +#pragma omp simd nontemporal(b) + for (int k = 0; k < 10; ++k) { + // OMP50: load double,{{.*}}!nontemporal + // OMP50: store double{{.*}}!nontemporal + a = b; + } + // OMP50-NOT: load double,{{.*}}!nontemporal + // OMP50: load double, + // OMP50: store double{{.*}}!nontemporal + a = b; + } +} + +extern struct T t; +struct Base { + float a; +}; +struct T : public Base { + void foo() { +#pragma omp simd nontemporal(Base::a) + for (int i = 0; i < 10; ++i) { + // OMP50: store float{{.*}}!nontemporal + // OMP50-NOT: nontemporal + // OMP50-NEXT: store float + Base::a = 0; + t.a = 0; + } + } +} t; + +void bartfoo() { + t.foo(); +} + +#endif // OMP5 // TERM_DEBUG-LABEL: bar int bar() {return 0;}; diff --git a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp index 5e1ed2d6275b04..055d5dce28bbd0 100644 --- a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp @@ -539,7 +539,7 @@ struct S1 { short int c[2][n]; #ifdef OMP5 - #pragma omp target parallel for simd if(n>60) + #pragma omp target parallel for simd if(n>60) nontemporal(a) #else #pragma omp target parallel for simd if(target: n>60) #endif // OMP5 @@ -837,6 +837,9 @@ int bar(int n){ // OMP45: define internal {{.*}}void [[OMP_OUTLINED5]](i32* noalias %.global_tid., i32* noalias %.bound_tid., [[S1]]* %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i16* {{.+}}) // OMP50: define internal {{.*}}void [[OMP_OUTLINED5]](i32* noalias %.global_tid., i32* noalias %.bound_tid., [[S1]]* %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i16* {{.+}}, i[[SZ]] %{{.+}}) // To reduce complexity, we're only going as far as validating the signature of the outlined parallel function. 
+// OMP45-NOT: !nontemporal +// OMP50: store double{{.*}}!nontemporal +// OMP50: load double{{.*}}!nontemporal // CHECK: define internal void [[HVT6]] diff --git a/clang/test/OpenMP/target_simd_codegen.cpp b/clang/test/OpenMP/target_simd_codegen.cpp index 6755b7e657e303..597cff7815a3bd 100644 --- a/clang/test/OpenMP/target_simd_codegen.cpp +++ b/clang/test/OpenMP/target_simd_codegen.cpp @@ -85,8 +85,8 @@ // CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [3 x i64] [i64 800, i64 800, i64 547] // CHECK-DAG: [[SIZET6:@.+]] = private unnamed_addr constant [4 x i64] [i64 4, i64 2, i64 1, i64 40] // CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [4 x i64] [i64 800, i64 800, i64 800, i64 547] -// OMP45-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [6 x i64] [i64 32, i64 281474976711171, i64 800, i64 800, i64 800, i64 547] -// OMP50-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [7 x i64] [i64 32, i64 281474976711171, i64 800, i64 800, i64 800, i64 547, i64 800] +// OMP45-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [5 x i64] [i64 547, i64 800, i64 800, i64 800, i64 547] +// OMP50-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [6 x i64] [i64 547, i64 800, i64 800, i64 800, i64 547, i64 800] // CHECK-DAG: @{{.*}} = weak constant i8 0 // CHECK-DAG: @{{.*}} = weak constant i8 0 // CHECK-DAG: @{{.*}} = weak constant i8 0 @@ -461,7 +461,11 @@ struct S1 { int b = n+1; short int c[2][n]; - #pragma omp target simd if(n>60) +#ifdef OMP5 + #pragma omp target simd if(n>60) nontemporal(a) private(a) +#else + #pragma omp target simd if(n>60) private(a) +#endif // OMP5 for (unsigned long long it = 2000; it >= 600; it -= 400) { this->a = (double)b + 1.5; c[1][1] = ++a; @@ -519,96 +523,84 @@ int bar(int n){ // CHECK-32: [[CSZSIZE:%.+]] = mul nuw i32 [[CELEMSIZE2]], 2 // CHECK-32: [[CSIZE:%.+]] = sext i32 [[CSZSIZE]] to i64 -// OMP45-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 6, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([6 x i64], [6 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1) -// OMP50-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 7, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([7 x i64], [7 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1) -// OMP45-DAG: [[BPR]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP:%.+]], i32 0, i32 0 -// OMP45-DAG: [[PR]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P:%.+]], i32 0, i32 0 -// OMP45-DAG: [[SR]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S:%.+]], i32 0, i32 0 -// OMP45-DAG: [[SADDR0:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX0:[0-9]+]] -// OMP45-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX0]] -// OMP45-DAG: [[PADDR0:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX0]] -// OMP45-DAG: [[SADDR1:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX1:[0-9]+]] -// OMP45-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX1]] -// OMP45-DAG: [[PADDR1:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX1]] -// OMP45-DAG: [[SADDR2:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX2:[0-9]+]] -// OMP45-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX2]] -// OMP45-DAG: [[PADDR2:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX2]] -// OMP45-DAG: 
[[SADDR3:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX3:[0-9]+]] -// OMP45-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX3]] -// OMP45-DAG: [[PADDR3:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX3]] -// OMP45-DAG: [[SADDR4:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX4:[0-9]+]] -// OMP45-DAG: [[BPADDR4:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX4]] -// OMP45-DAG: [[PADDR4:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX4]] -// OMP45-DAG: [[SADDR5:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX5:[0-9]+]] -// OMP45-DAG: [[BPADDR5:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX5]] -// OMP45-DAG: [[PADDR5:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX5]] -// OMP50-DAG: [[BPR]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP:%.+]], i32 0, i32 0 -// OMP50-DAG: [[PR]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P:%.+]], i32 0, i32 0 -// OMP50-DAG: [[SR]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S:%.+]], i32 0, i32 0 -// OMP50-DAG: [[SADDR0:%.+]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S]], i32 [[IDX0:[0-9]+]] -// OMP50-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP]], i32 [[IDX0]] -// OMP50-DAG: [[PADDR0:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P]], i32 [[IDX0]] -// OMP50-DAG: [[SADDR1:%.+]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S]], i32 [[IDX1:[0-9]+]] -// OMP50-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP]], i32 [[IDX1]] -// OMP50-DAG: [[PADDR1:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P]], i32 [[IDX1]] -// OMP50-DAG: [[SADDR2:%.+]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S]], i32 [[IDX2:[0-9]+]] -// OMP50-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP]], i32 [[IDX2]] -// OMP50-DAG: [[PADDR2:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P]], i32 [[IDX2]] -// OMP50-DAG: [[SADDR3:%.+]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S]], i32 [[IDX3:[0-9]+]] -// OMP50-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP]], i32 [[IDX3]] -// OMP50-DAG: [[PADDR3:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P]], i32 [[IDX3]] -// OMP50-DAG: [[SADDR4:%.+]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S]], i32 [[IDX4:[0-9]+]] -// OMP50-DAG: [[BPADDR4:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP]], i32 [[IDX4]] -// OMP50-DAG: [[PADDR4:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P]], i32 [[IDX4]] -// OMP50-DAG: [[SADDR5:%.+]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S]], i32 [[IDX5:[0-9]+]] -// OMP50-DAG: [[BPADDR5:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP]], i32 [[IDX5]] -// OMP50-DAG: [[PADDR5:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P]], i32 [[IDX5]] -// OMP50-DAG: [[SADDR6:%.+]] = getelementptr inbounds [7 x i64], [7 x i64]* [[S]], i32 [[IDX6:[0-9]+]] -// OMP50-DAG: [[BPADDR6:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[BP]], i32 [[IDX6]] -// OMP50-DAG: [[PADDR6:%.+]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[P]], i32 [[IDX6]] +// OMP45-DAG: [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1) +// OMP50-DAG: [[RET:%.+]] = call i32 
@__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 6, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([6 x i64], [6 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1) +// OMP45-DAG: [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0 +// OMP45-DAG: [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0 +// OMP45-DAG: [[SR]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S:%.+]], i32 0, i32 0 +// OMP45-DAG: [[SADDR0:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX0:[0-9]+]] +// OMP45-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX0]] +// OMP45-DAG: [[PADDR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX0]] +// OMP45-DAG: [[SADDR1:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX1:[0-9]+]] +// OMP45-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX1]] +// OMP45-DAG: [[PADDR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX1]] +// OMP45-DAG: [[SADDR2:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX2:[0-9]+]] +// OMP45-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX2]] +// OMP45-DAG: [[PADDR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX2]] +// OMP45-DAG: [[SADDR3:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX3:[0-9]+]] +// OMP45-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX3]] +// OMP45-DAG: [[PADDR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX3]] +// OMP45-DAG: [[SADDR4:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX4:[0-9]+]] +// OMP45-DAG: [[BPADDR4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX4]] +// OMP45-DAG: [[PADDR4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX4]] +// OMP50-DAG: [[BPR]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP:%.+]], i32 0, i32 0 +// OMP50-DAG: [[PR]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P:%.+]], i32 0, i32 0 +// OMP50-DAG: [[SR]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S:%.+]], i32 0, i32 0 +// OMP50-DAG: [[SADDR0:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX0:[0-9]+]] +// OMP50-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX0]] +// OMP50-DAG: [[PADDR0:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX0]] +// OMP50-DAG: [[SADDR1:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX1:[0-9]+]] +// OMP50-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX1]] +// OMP50-DAG: [[PADDR1:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX1]] +// OMP50-DAG: [[SADDR2:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX2:[0-9]+]] +// OMP50-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX2]] +// OMP50-DAG: [[PADDR2:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX2]] +// OMP50-DAG: [[SADDR3:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX3:[0-9]+]] +// OMP50-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX3]] +// OMP50-DAG: [[PADDR3:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX3]] +// OMP50-DAG: [[SADDR4:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 
[[IDX4:[0-9]+]] +// OMP50-DAG: [[BPADDR4:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX4]] +// OMP50-DAG: [[PADDR4:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX4]] +// OMP50-DAG: [[SADDR5:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX5:[0-9]+]] +// OMP50-DAG: [[BPADDR5:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX5]] +// OMP50-DAG: [[PADDR5:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX5]] // The names below are not necessarily consistent with the names used for the // addresses above as some are repeated. // CHECK-DAG: store [[S1]]* %{{.+}}, [[S1]]** [[CBPADDR0:%.+]], -// CHECK-DAG: store double* %{{.+}}, double** [[CPADDR0:%.+]], +// CHECK-DAG: store [[S1]]* %{{.+}}, [[S1]]** [[CPADDR0:%.+]], // CHECK-DAG: [[CBPADDR0]] = bitcast i8** {{%[^,]+}} to [[S1]]** -// CHECK-DAG: [[CPADDR0]] = bitcast i8** {{%[^,]+}} to double** -// CHECK-DAG: store i64 %{{.+}}, i64* {{%[^,]+}} - -// CHECK-DAG: store [[S1]]* %{{.+}}, [[S1]]** [[CBPADDR1:%.+]], -// CHECK-DAG: store double* %{{.+}}, double** [[CPADDR1:%.+]], -// CHECK-DAG: [[CBPADDR1]] = bitcast i8** {{%[^,]+}} to [[S1]]** -// CHECK-DAG: [[CPADDR1]] = bitcast i8** {{%[^,]+}} to double** +// CHECK-DAG: [[CPADDR0]] = bitcast i8** {{%[^,]+}} to [[S1]]** // CHECK-DAG: store i64 {{4|8}}, i64* {{%[^,]+}} -// CHECK-DAG: store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CBPADDR2:%.+]], -// CHECK-DAG: store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CPADDR2:%.+]], +// CHECK-DAG: store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CBPADDR1:%.+]], +// CHECK-DAG: store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CPADDR1:%.+]], +// CHECK-DAG: [[CBPADDR1]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* +// CHECK-DAG: [[CPADDR1]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* +// CHECK-DAG: store i64 4, i64* {{%[^,]+}} + +// CHECK-DAG: store i[[SZ]] 2, i[[SZ]]* [[CBPADDR2:%.+]], +// CHECK-DAG: store i[[SZ]] 2, i[[SZ]]* [[CPADDR2:%.+]], // CHECK-DAG: [[CBPADDR2]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* // CHECK-DAG: [[CPADDR2]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* -// CHECK-DAG: store i64 4, i64* {{%[^,]+}} +// CHECK-DAG: store i64 {{4|8}}, i64* {{%[^,]+}} -// CHECK-DAG: store i[[SZ]] 2, i[[SZ]]* [[CBPADDR3:%.+]], -// CHECK-DAG: store i[[SZ]] 2, i[[SZ]]* [[CPADDR3:%.+]], +// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[CBPADDR3:%.+]], +// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[CPADDR3:%.+]], // CHECK-DAG: [[CBPADDR3]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* // CHECK-DAG: [[CPADDR3]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* // CHECK-DAG: store i64 {{4|8}}, i64* {{%[^,]+}} -// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[CBPADDR4:%.+]], -// CHECK-DAG: store i[[SZ]] %{{.+}}, i[[SZ]]* [[CPADDR4:%.+]], -// CHECK-DAG: [[CBPADDR4]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* -// CHECK-DAG: [[CPADDR4]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* -// CHECK-DAG: store i64 {{4|8}}, i64* {{%[^,]+}} - -// CHECK-DAG: store i16* %{{.+}}, i16** [[CBPADDR5:%.+]], -// CHECK-DAG: store i16* %{{.+}}, i16** [[CPADDR5:%.+]], -// CHECK-DAG: [[CBPADDR5]] = bitcast i8** {{%[^,]+}} to i16** -// CHECK-DAG: [[CPADDR5]] = bitcast i8** {{%[^,]+}} to i16** +// CHECK-DAG: store i16* %{{.+}}, i16** [[CBPADDR4:%.+]], +// CHECK-DAG: store i16* %{{.+}}, i16** [[CPADDR4:%.+]], +// CHECK-DAG: [[CBPADDR4]] = bitcast i8** {{%[^,]+}} to i16** +// CHECK-DAG: [[CPADDR4]] = bitcast i8** {{%[^,]+}} to i16** // CHECK-DAG: store i64 [[CSIZE]], i64* {{%[^,]+}} -// OMP50-DAG: store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CBPADDR6:%.+]], -// OMP50-DAG: store i[[SZ]] [[SIMD_COND]], 
i[[SZ]]* [[CPADDR6:%.+]], -// OMP50-DAG: [[CBPADDR6]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* -// OMP50-DAG: [[CPADDR6]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* +// OMP50-DAG: store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CBPADDR5:%.+]], +// OMP50-DAG: store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CPADDR5:%.+]], +// OMP50-DAG: [[CBPADDR5]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* +// OMP50-DAG: [[CPADDR5]] = bitcast i8** {{%[^,]+}} to i[[SZ]]* // OMP50-DAG: store i64 1, i64* {{%[^,]+}} // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET]], 0 @@ -738,6 +730,10 @@ int bar(int n){ // OMP50-DAG: [[CONV_COND:%.+]] = bitcast i[[SZ]]* [[LOCAL_SIMD_COND_CASTED]] to i8* // OMP50-DAG: [[SIMD_COND:%.+]] = load i8, i8* [[CONV_COND]], // OMP50-DAG: trunc i8 [[SIMD_COND]] to i1 +// OMP45-NOT: !nontemporal +// OMP50: store double {{.*}}!nontemporal +// OMP50: load double, {{.*}}!nontemporal +// OMP50: store double {{.*}}!nontemporal // CHECK: define internal void [[HVT6]] // Create local storage for each capture. diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp index 8a8d64e36f8d1d..dda468e604ebaf 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp @@ -35,20 +35,25 @@ void gtid_test() { // CHECK: call i{{[0-9]+}} @__tgt_target_teams( // CHECK: call void [[OFFLOADING_FUN_1:@.+]]( #ifdef OMP5 -#pragma omp target teams distribute parallel for simd if(simd: true) +#pragma omp target teams distribute parallel for simd if(simd: true) nontemporal(Arg) #else #pragma omp target teams distribute parallel for simd #endif // OMP5 - for(int i = 0 ; i < 100; i++) {} + for (int i = 0; i < 100; i++) { + Arg = 0; + } // CHECK: define internal void [[OFFLOADING_FUN_0]]( - // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}}) + // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}}) // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_0]]( // CHECK: call void @__kmpc_for_static_init_4( - // CHECK: call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, {{.+}}* [[OMP_OUTLINED_0:@.+]] to void + // OMP50: load i32,{{.*}}!nontemporal + // CHECK: call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 3, {{.+}}* [[OMP_OUTLINED_0:@.+]] to void // CHECK: call void @__kmpc_for_static_fini( // CHECK: define{{.+}} void [[OMP_OUTLINED_0]]( // CHECK: call void @__kmpc_for_static_init_4( + // OMP45-NOT: !nontemporal + // OMP50: store i32 0,{{.*}}!nontemporal // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret #pragma omp target teams distribute parallel for simd if (parallel: false) diff --git a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp index 14c3cd62ebfd3a..4912352e17ca2a 100644 --- a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp @@ -165,7 +165,7 @@ int foo(int n) { // CHECK: call void [[HVT1:@.+]](i[[SZ]] {{[^,]+}}) #ifdef OMP5 - #pragma omp target teams distribute simd if(target: 0) safelen(32) linear(a) if(simd: 1) + #pragma omp target teams distribute simd if(target: 0) safelen(32) linear(a) if(simd: 1) nontemporal(a) #else #pragma omp target teams distribute simd if(target: 0) safelen(32) linear(a) #endif // OMP5 @@ -395,6 +395,8 @@ int 
foo(int n) { // CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], align // CHECK: store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align // CHECK-64: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32* +// OMP45-NOT: !nontemporal +// OMP50: load i32,{{.*}}!nontemporal // CHECK-64: store i32 10, i32* [[AA_CADDR]], align // CHECK-32: store i32 10, i32* [[AA_ADDR]], align // CHECK: ret void diff --git a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp index b9ee8cc728fc7b..63dd237a964dbb 100644 --- a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp @@ -177,16 +177,16 @@ struct SS{ // CK3: define {{.*}}i32 @{{.+}}foo{{.+}}( int foo(void) { - // CK3: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 2, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* %{{.+}}, i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 1) + // CK3: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 3, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* %{{.+}}, i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 1) // CK3: call void @[[OFFL1:.+]]([[SSI]]* %{{.+}}) #pragma omp target #ifdef OMP5 - #pragma omp teams distribute simd if(b) + #pragma omp teams distribute simd if(b) nontemporal(a, b) #else #pragma omp teams distribute simd #endif // OMP5 for(int i = 0; i < X; i++) { - a[i] = (T)0; + a[i] = (T)b; } // outlined target region @@ -197,6 +197,8 @@ struct SS{ // CK3: define internal void @[[OUTL1]]({{.+}}) // CK3: call void @__kmpc_for_static_init_4( + // OMP3_45-NOT: !nontemporal + // OMP3_50: load float,{{.*}}!nontemporal // CK3: call void @__kmpc_for_static_fini( // CK3: ret void diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 229abece37a4ca..94084e169faf3e 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -2458,6 +2458,8 @@ void OMPClauseEnqueue::VisitOMPIsDevicePtrClause(const OMPIsDevicePtrClause *C) void OMPClauseEnqueue::VisitOMPNontemporalClause( const OMPNontemporalClause *C) { VisitOMPClauseList(C); + for (const auto *E : C->private_refs()) + Visitor->AddStmt(E); } } From 8cefc37be5aba4948936c7beb97cde7a68449f1f Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 23 Dec 2019 09:46:49 -0500 Subject: [PATCH 2/5] [DAGCombine] visitEXTRACT_SUBVECTOR - 'little to big' extract_subvector(bitcast()) support This moves the X86 specific transform from rL364407 into DAGCombiner to generically handle 'little to big' cases (for example: extract_subvector(v2i64 bitcast(v16i8))). This allows us to remove both the x86 implementation and the aarch64 bitcast(extract_subvector(bitcast())) combine. 
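
As an illustrative sketch of the fold (hypothetical types, chosen only to satisfy the new (DestNumElts % SrcNumElts) == 0 path; they are not taken from the updated tests), an extract_subvector that indexes in narrow elements over a bitcast of a wide-element vector is rewritten to index in the wide elements directly, with the index scaled down by DestNumElts / SrcNumElts:

  ; before
  t1: v64i8 = bitcast t0   (t0: v8i64)
  t2: v32i8 = extract_subvector t1, 32

  ; after (ratio = 64 / 8 = 8, index 32 -> 4)
  t3: v4i64 = extract_subvector t0, 4
  t2: v32i8 = bitcast t3
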
Earlier patches that dealt with regressions initially exposed by this patch: rG5e5e99c041e4 rG0b38af89e2c0 Patch by: @RKSimon (Simon Pilgrim) Differential Revision: https://reviews.llvm.org/D63815 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 ++++- .../Target/AArch64/AArch64ISelLowering.cpp | 71 ------------------- llvm/lib/Target/X86/X86ISelLowering.cpp | 26 ------- llvm/test/CodeGen/AArch64/merge-store.ll | 20 ++---- llvm/test/CodeGen/ARM/combine-vmovdrr.ll | 4 +- llvm/test/CodeGen/X86/avg-mask.ll | 68 +++++++++--------- llvm/test/CodeGen/X86/madd.ll | 12 ++-- .../CodeGen/X86/masked_store_trunc_ssat.ll | 4 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 36 +++++----- 9 files changed, 83 insertions(+), 176 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 1ae2f58415fa66..02ae11d8a00252 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18515,7 +18515,23 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { return DAG.getBitcast(NVT, NewExtract); } } - // TODO - handle (DestNumElts % SrcNumElts) == 0 + if ((DestNumElts % SrcNumElts) == 0) { + unsigned DestSrcRatio = DestNumElts / SrcNumElts; + if ((NVT.getVectorNumElements() % DestSrcRatio) == 0) { + unsigned NewExtNumElts = NVT.getVectorNumElements() / DestSrcRatio; + EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), + SrcVT.getScalarType(), NewExtNumElts); + if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && + TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { + unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; + SDLoc DL(N); + SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); + SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, + V.getOperand(0), NewIndex); + return DAG.getBitcast(NVT, NewExtract); + } + } + } } // Combine: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 9ae84f51407cf8..c5ea8e0f9fff90 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -618,7 +618,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); - setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::STORE); if (Subtarget->supportsAddressTopByteIgnored()) @@ -10185,74 +10184,6 @@ static SDValue performSRLCombine(SDNode *N, return SDValue(); } -static SDValue performBitcastCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // Wait 'til after everything is legalized to try this. That way we have - // legal vector types and such. - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - // Remove extraneous bitcasts around an extract_subvector. - // For example, - // (v4i16 (bitconvert - // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) - // becomes - // (extract_subvector ((v8i16 ...), (i64 4))) - - // Only interested in 64-bit vectors as the ultimate result. - EVT VT = N->getValueType(0); - if (!VT.isVector() || VT.isScalableVector()) - return SDValue(); - if (VT.getSimpleVT().getSizeInBits() != 64) - return SDValue(); - // Is the operand an extract_subvector starting at the beginning or halfway - // point of the vector? 
A low half may also come through as an - // EXTRACT_SUBREG, so look for that, too. - SDValue Op0 = N->getOperand(0); - if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && - !(Op0->isMachineOpcode() && - Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) - return SDValue(); - uint64_t idx = cast(Op0->getOperand(1))->getZExtValue(); - if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { - if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) - return SDValue(); - } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { - if (idx != AArch64::dsub) - return SDValue(); - // The dsub reference is equivalent to a lane zero subvector reference. - idx = 0; - } - // Look through the bitcast of the input to the extract. - if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) - return SDValue(); - SDValue Source = Op0->getOperand(0)->getOperand(0); - // If the source type has twice the number of elements as our destination - // type, we know this is an extract of the high or low half of the vector. - EVT SVT = Source->getValueType(0); - if (!SVT.isVector() || - SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) - return SDValue(); - - LLVM_DEBUG( - dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); - - // Create the simplified form to just extract the low or high half of the - // vector directly rather than bothering with the bitcasts. - SDLoc dl(N); - unsigned NumElements = VT.getVectorNumElements(); - if (idx) { - SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); - } else { - SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32); - return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, - Source, SubReg), - 0); - } -} - static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -12453,8 +12384,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performExtendCombine(N, DCI, DAG); case ISD::SIGN_EXTEND_INREG: return performSignExtendInRegCombine(N, DCI, DAG); - case ISD::BITCAST: - return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); case ISD::SELECT: diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 54a64052ebb6cf..cd6821c16e8b84 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45103,7 +45103,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue InVec = N->getOperand(0); SDValue InVecBC = peekThroughBitcasts(InVec); EVT InVecVT = InVec.getValueType(); - EVT InVecBCVT = InVecBC.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && @@ -45147,31 +45146,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, VT, SDLoc(N), InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements())); - // Try to move vector bitcast after extract_subv by scaling extraction index: - // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') - // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR - if (InVec != InVecBC && InVecBCVT.isVector()) { - unsigned SrcNumElts = InVecBCVT.getVectorNumElements(); - unsigned DestNumElts = InVecVT.getVectorNumElements(); - if ((DestNumElts % SrcNumElts) == 0) { - unsigned DestSrcRatio = DestNumElts / SrcNumElts; - if ((VT.getVectorNumElements() % 
DestSrcRatio) == 0) { - unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio; - EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), - InVecBCVT.getScalarType(), NewExtNumElts); - if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && - TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { - unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; - SDLoc DL(N); - SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); - SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, - InVecBC, NewIndex); - return DAG.getBitcast(VT, NewExtract); - } - } - } - } - // If we are extracting from an insert into a zero vector, replace with a // smaller insert into zero if we don't access less than the original // subvector. Don't do this for i1 vectors. diff --git a/llvm/test/CodeGen/AArch64/merge-store.ll b/llvm/test/CodeGen/AArch64/merge-store.ll index cd9564210e9af6..f0a53384cdd898 100644 --- a/llvm/test/CodeGen/AArch64/merge-store.ll +++ b/llvm/test/CodeGen/AArch64/merge-store.ll @@ -42,17 +42,10 @@ define void @blam() { ; the fastness of unaligned accesses was not specified correctly. define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) { -; SPLITTING-LABEL: merge_vec_extract_stores: -; SPLITTING: // %bb.0: -; SPLITTING-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; SPLITTING-NEXT: str d0, [x0, #24] -; SPLITTING-NEXT: str d1, [x0, #32] -; SPLITTING-NEXT: ret -; -; MISALIGNED-LABEL: merge_vec_extract_stores: -; MISALIGNED: // %bb.0: -; MISALIGNED-NEXT: stur q0, [x0, #24] -; MISALIGNED-NEXT: ret +; CHECK-LABEL: merge_vec_extract_stores: +; CHECK: // %bb.0: +; CHECK-NEXT: stur q0, [x0, #24] +; CHECK-NEXT: ret %idx0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3 %idx1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 4 @@ -62,9 +55,4 @@ define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) { store <2 x float> %shuffle0, <2 x float>* %idx0, align 8 store <2 x float> %shuffle1, <2 x float>* %idx1, align 8 ret void - - -; FIXME: Ideally we would like to use a generic target for this test, but this relies -; on suppressing store pairs. - } diff --git a/llvm/test/CodeGen/ARM/combine-vmovdrr.ll b/llvm/test/CodeGen/ARM/combine-vmovdrr.ll index 358f7e3a983e0f..01526b371990eb 100644 --- a/llvm/test/CodeGen/ARM/combine-vmovdrr.ll +++ b/llvm/test/CodeGen/ARM/combine-vmovdrr.ll @@ -9,8 +9,8 @@ declare <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %shuffle.i.i307, <8 x i8> %shuffl ; they are defined on VPRs and used on VPRs. 
; ; CHECK-LABEL: motivatingExample: -; CHECK: vldr [[ARG2_VAL:d[0-9]+]], [r1] -; CHECK-NEXT: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0] +; CHECK: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0] +; CHECK-NEXT: vldr [[ARG2_VAL:d[0-9]+]], [r1] ; CHECK-NEXT: vtbl.8 [[RES:d[0-9]+]], {[[ARG1_VALlo]], [[ARG1_VALhi]]}, [[ARG2_VAL]] ; CHECK-NEXT: vstr [[RES]], [r1] ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/X86/avg-mask.ll b/llvm/test/CodeGen/X86/avg-mask.ll index a7ce07ab0cd971..ebe9e75ab4f3eb 100644 --- a/llvm/test/CodeGen/X86/avg-mask.ll +++ b/llvm/test/CodeGen/X86/avg-mask.ll @@ -130,9 +130,9 @@ define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 ; AVX512F-NEXT: shrq $32, %rdi ; AVX512F-NEXT: shrq $48, %rax ; AVX512F-NEXT: shrl $16, %ecx -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 -; AVX512F-NEXT: vpavgb %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: kmovw %eax, %k3 @@ -142,14 +142,14 @@ define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 ; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} ; AVX512F-NEXT: vpmovdb %zmm5, %xmm5 ; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm1 -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} -; AVX512F-NEXT: vpmovdb %zmm4, %xmm4 -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v64i8_mask: @@ -178,9 +178,9 @@ define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin ; AVX512F-NEXT: shrq $32, %rdi ; AVX512F-NEXT: shrq $48, %rax ; AVX512F-NEXT: shrl $16, %ecx -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpavgb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: kmovw %eax, %k3 @@ -190,14 +190,14 @@ define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin ; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 ; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; 
AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v64i8_maskz: @@ -330,18 +330,18 @@ define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: shrl $16, %edi -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 -; AVX512F-NEXT: vpavgw %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw %edi, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm1 -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v32i16_mask: @@ -366,18 +366,18 @@ define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nou ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: shrl $16, %edi -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpavgw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw %edi, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 -; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v32i16_maskz: diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index 62792ec074aedf..11756574217ef9 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -1975,9 +1975,9 @@ define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) { ; ; AVX512F-LABEL: pmaddwd_32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -2188,9 +2188,9 @@ define <16 x i32> @jumbled_indices16(<32 x i16> 
%A, <32 x i16> %B) { ; ; AVX512F-LABEL: jumbled_indices16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index cc73f563581ad0..b79fb5c35f3d5f 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -6374,9 +6374,9 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma ; ; AVX512F-LABEL: truncstore_v32i16_v32i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: vpmovmskb %ymm1, %eax diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index 720cabee9122c8..7f00b49b81f850 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -725,33 +725,33 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) { ; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; From 79b3325be0b016fdc1a2c55bce65ec9f1e5f4eb6 Mon Sep 17 00:00:00 2001 From: czhengsz Date: Mon, 23 Dec 2019 10:26:41 -0500 Subject: [PATCH 3/5] [PowerPC] NFC - fix the testcase bug of folding rlwinm --- llvm/test/CodeGen/PowerPC/fold-rlwinm.mir | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir b/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir index 426aaa7a763133..cf96bc11639078 100644 --- a/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir +++ b/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir @@ -11,7 +11,7 @@ body: | %0:g8rc = COPY $x3 %1:gprc = COPY %0.sub_32:g8rc %2:gprc = RLWINM %1:gprc, 27, 5, 31 - ; CHECK-NOT: %2:gprc = RLWINM %1:gprc, 27, 5, 31 + ; CHECK-NOT: %2:gprc = RLWINM %1, 27, 5, 31 %3:gprc = RLWINM %2:gprc, 19, 0, 12 ; CHECK: %3:gprc = RLWINM %1, 14, 0, 12 BLR8 implicit $lr8, implicit $rm @@ -26,7 +26,7 @@ body: | %0:g8rc = COPY $x3 %1:gprc = COPY %0.sub_32:g8rc %2:gprc = RLWINM %1:gprc, 27, 0, 31 - ; CHECK-NOT: %2:gprc = RLWINM %1:gprc, 27, 0, 31 + ; CHECK-NOT: %2:gprc = RLWINM %1, 27, 0, 31 %3:gprc = RLWINM %2:gprc, 19, 0, 12 ; CHECK: %3:gprc = RLWINM %1, 14, 0, 12 BLR8 implicit $lr8, implicit $rm @@ -41,7 +41,7 @@ body: | %0:g8rc = COPY $x3 %1:gprc = COPY %0.sub_32:g8rc %2:gprc = RLWINM %1:gprc, 27, 10, 9 - ; CHECK-NOT: %2:gprc = RLWINM %1:gprc, 27, 10, 9 + ; CHECK-NOT: %2:gprc = RLWINM %1, 27, 10, 9 %3:gprc = RLWINM %2:gprc, 19, 10, 1 ; CHECK: %3:gprc = RLWINM %1, 14, 10, 1 BLR8 implicit $lr8, implicit $rm @@ -56,7 +56,7 @@ body: | %0:g8rc = COPY $x3 %1:gprc = COPY %0.sub_32:g8rc %2:gprc = RLWINM %1:gprc, 27, 30, 10 - ; CHECK-NOT: %2:gprc = RLWINM %1:gprc, 27, 30 ,10 + ; CHECK-NOT: %2:gprc = RLWINM %1, 27, 30, 10 %3:gprc = RLWINM %2:gprc, 19, 0, 12 ; CHECK: %3:gprc = RLWINM %1, 14, 11, 12 BLR8 implicit $lr8, implicit $rm @@ -71,7 +71,7 @@ body: | %0:g8rc = COPY $x3 %1:gprc = COPY %0.sub_32:g8rc %2:gprc = RLWINM %1:gprc, 10, 5, 31 - ; CHECKT: %2:gprc = RLWINM %1:gprc, 10, 5, 31 + ; CHECK: %2:gprc = RLWINM %1, 10, 5, 31 %3:gprc = RLWINM %2:gprc, 10, 30, 5 ; CHECK: %3:gprc = RLWINM %2, 10, 30, 5 BLR8 implicit $lr8, implicit $rm @@ -103,7 +103,7 @@ body: | %0:g8rc = COPY $x3 %1:gprc = COPY %0.sub_32:g8rc %2:gprc = RLWINM %1:gprc, 27, 5, 10 - ; CHECK-NOT: %2:gprc = RLWINM %1:gprc, 27, 5, 10 + ; CHECK: %2:gprc = RLWINM %1, 27, 5, 10 %3:gprc = RLWINM %2:gprc, 8, 5, 10 ; CHECK: %3:gprc = LI 0 BLR8 implicit $lr8, implicit $rm @@ -118,7 +118,7 @@ body: | %0:g8rc = COPY $x3 %1:gprc = COPY %0.sub_32:g8rc %2:gprc = RLWINM %1:gprc, 27, 5, 10 - ; 
CHECK-NOT: %2:gprc = RLWINM %1:gprc, 27, 5, 10 + ; CHECK: %2:gprc = RLWINM %1, 27, 5, 10 %3:gprc = RLWINMo %2:gprc, 8, 5, 10, implicit-def $cr0 ; CHECK: %3:gprc = ANDIo %2, 0, implicit-def $cr0 BLR8 implicit $lr8, implicit $rm From 5b1d0dc6bf0618690057bea749830036f70b0491 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Marques?= Date: Mon, 23 Dec 2019 15:32:52 +0000 Subject: [PATCH 4/5] [RISCV][NFC] Fix use of missing attribute groups in tests --- llvm/test/CodeGen/RISCV/frame-info.ll | 2 +- llvm/test/CodeGen/RISCV/frame.ll | 2 +- llvm/test/CodeGen/RISCV/remat.ll | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/frame-info.ll b/llvm/test/CodeGen/RISCV/frame-info.ll index 4908dec3501366..0b1c1bde05e515 100644 --- a/llvm/test/CodeGen/RISCV/frame-info.ll +++ b/llvm/test/CodeGen/RISCV/frame-info.ll @@ -143,7 +143,7 @@ define void @stack_alloc(i32 signext %size) { ; RV64-WITHFP-NEXT: ret entry: %0 = alloca i8, i32 %size, align 16 - call void @callee_with_args(i8* nonnull %0) #2 + call void @callee_with_args(i8* nonnull %0) ret void } diff --git a/llvm/test/CodeGen/RISCV/frame.ll b/llvm/test/CodeGen/RISCV/frame.ll index e8a833f37f83b2..6f6be73e708079 100644 --- a/llvm/test/CodeGen/RISCV/frame.ll +++ b/llvm/test/CodeGen/RISCV/frame.ll @@ -45,7 +45,7 @@ define i32 @test() nounwind { %1 = bitcast %struct.key_t* %key to i8* call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 0, i64 20, i1 false) %2 = getelementptr inbounds %struct.key_t, %struct.key_t* %key, i64 0, i32 1, i64 0 - call void @test1(i8* %2) #3 + call void @test1(i8* %2) ret i32 0 } diff --git a/llvm/test/CodeGen/RISCV/remat.ll b/llvm/test/CodeGen/RISCV/remat.ll index e440515d6322da..b3be74cd76dbfc 100644 --- a/llvm/test/CodeGen/RISCV/remat.ll +++ b/llvm/test/CodeGen/RISCV/remat.ll @@ -144,7 +144,7 @@ if.then: ; preds = %for.body %3 = load i32, i32* @c, align 4 %4 = load i32, i32* @d, align 4 %5 = load i32, i32* @e, align 4 - %call = tail call i32 @foo(i32 %0, i32 %2, i32 %3, i32 %4, i32 %5, i32 32) #3 + %call = tail call i32 @foo(i32 %0, i32 %2, i32 %3, i32 %4, i32 %5, i32 32) br label %if.end if.end: ; preds = %for.body, %if.then @@ -158,7 +158,7 @@ if.then3: ; preds = %if.end %9 = load i32, i32* @d, align 4 %10 = load i32, i32* @e, align 4 %11 = load i32, i32* @f, align 4 - %call4 = tail call i32 @foo(i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 64) #3 + %call4 = tail call i32 @foo(i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 64) br label %if.end5 if.end5: ; preds = %if.end, %if.then3 @@ -172,7 +172,7 @@ if.then7: ; preds = %if.end5 %15 = load i32, i32* @e, align 4 %16 = load i32, i32* @f, align 4 %17 = load i32, i32* @g, align 4 - %call8 = tail call i32 @foo(i32 %13, i32 %14, i32 %15, i32 %16, i32 %17, i32 32) #3 + %call8 = tail call i32 @foo(i32 %13, i32 %14, i32 %15, i32 %16, i32 %17, i32 32) br label %if.end9 if.end9: ; preds = %if.end5, %if.then7 @@ -186,7 +186,7 @@ if.then11: ; preds = %if.end9 %21 = load i32, i32* @f, align 4 %22 = load i32, i32* @g, align 4 %23 = load i32, i32* @h, align 4 - %call12 = tail call i32 @foo(i32 %19, i32 %20, i32 %21, i32 %22, i32 %23, i32 32) #3 + %call12 = tail call i32 @foo(i32 %19, i32 %20, i32 %21, i32 %22, i32 %23, i32 32) br label %for.inc for.inc: ; preds = %if.end9, %if.then11 From c7c05b0c8a046c9bef46b4e4c7a35c262d1d880a Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 23 Dec 2019 13:42:12 +0000 Subject: [PATCH 5/5] [AMDGPU] Don't create MachinePointerInfos with an UndefValue pointer Summary: The only useful information the 
UndefValue conveys is the address space, which MachinePointerInfo can represent directly without referring to an IR value. Reviewers: arsenm, rampitec Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D71838 --- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 3 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 8 +- llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 12 +- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 10 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 12 +- .../GlobalISel/irtranslator-amdgpu_kernel.ll | 152 +++++++++--------- .../GlobalISel/legalize-addrspacecast.mir | 8 +- 7 files changed, 91 insertions(+), 114 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index a68ba23e411bfd..f0650344320581 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -368,8 +368,7 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, MachineFunction &MF = B.getMF(); const Function &F = MF.getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); - PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS); - MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); unsigned TypeSize = DL.getTypeStoreSize(ParamTy); Register PtrReg = lowerParameterPtr(B, ParamTy, Offset); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 055bde28cd59c8..cd2a02376ecdd0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1186,12 +1186,8 @@ Register AMDGPULegalizerInfo::getSegmentAperture( // private_segment_aperture_base_hi. uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; - // FIXME: Don't use undef - Value *V = UndefValue::get(PointerType::get( - Type::getInt8Ty(MF.getFunction().getContext()), - AMDGPUAS::CONSTANT_ADDRESS)); - - MachinePointerInfo PtrInfo(V, StructOffset); + // TODO: can we be smarter about machine pointer info? + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOLoad | diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index b02c4609cef307..dbc9afaa33c039 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1175,8 +1175,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, // Load dword // TODO: can we be smarter about machine pointer info? - MachinePointerInfo PtrInfo(UndefValue::get( - Type::getInt32PtrTy(*DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))); + MachinePointerInfo PtrInfo(AMDGPUAS::PRIVATE_ADDRESS); SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo); Chain = Dst.getValue(1); @@ -1406,8 +1405,7 @@ SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op, // Load dword // TODO: can we be smarter about machine pointer info? - MachinePointerInfo PtrInfo(UndefValue::get( - Type::getInt32PtrTy(*DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))); + MachinePointerInfo PtrInfo(AMDGPUAS::PRIVATE_ADDRESS); SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo); // Get offset within the register. 
@@ -1608,9 +1606,6 @@ SDValue R600TargetLowering::LowerFormalArguments( continue; } - PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::PARAM_I_ADDRESS); - // i64 isn't a legal type, so the register type used ends up as i32, which // isn't expected here. It attempts to create this sextload, but it ends up // being invalid. Somehow this seems to work with i64 arguments, but breaks @@ -1631,11 +1626,10 @@ SDValue R600TargetLowering::LowerFormalArguments( // XXX - I think PartOffset should give you this, but it seems to give the // size of the register which isn't useful. - unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); unsigned PartOffset = VA.getLocMemOffset(); unsigned Alignment = MinAlign(VT.getStoreSize(), PartOffset); - MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); + MachinePointerInfo PtrInfo(AMDGPUAS::PARAM_I_ADDRESS); SDValue Arg = DAG.getLoad( ISD::UNINDEXED, Ext, VT, DL, Chain, DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index ed07ed100a1924..8364665dda04c9 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -578,10 +578,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, // We now have the GIT ptr - now get the scratch descriptor from the entry // at offset 0 (or offset 16 for a compute shader). - PointerType *PtrTy = - PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()), - AMDGPUAS::CONSTANT_ADDRESS); - MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM); auto MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | @@ -623,10 +620,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, } else { const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); - PointerType *PtrTy = - PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()), - AMDGPUAS::CONSTANT_ADDRESS); - MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); auto MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8c2012204450f6..e9f2a675de9ac8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1473,9 +1473,7 @@ SDValue SITargetLowering::lowerKernargMemParameter( const SDLoc &SL, SDValue Chain, uint64_t Offset, unsigned Align, bool Signed, const ISD::InputArg *Arg) const { - Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); - MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); // Try to avoid using an extload by loading earlier than the argument address, // and extracting the relevant bits. 
The load should hopefully be merged with @@ -2875,8 +2873,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), /*isVol = */ false, /*AlwaysInline = */ true, /*isTailCall = */ false, DstInfo, - MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy( - *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS)))); + MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS)); MemOpChains.push_back(Cpy); } else { @@ -4717,10 +4714,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, // TODO: Use custom target PseudoSourceValue. // TODO: We should use the value from the IR intrinsic call, but it might not // be available and how do we get it? - Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()), - AMDGPUAS::CONSTANT_ADDRESS)); - - MachinePointerInfo PtrInfo(V, StructOffset); + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo, MinAlign(64, StructOffset), MachineMemOperand::MODereferenceable | diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll index 6a476fcec2c04b..4d405c68238466 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll @@ -8,10 +8,10 @@ define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) noun ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4) ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8) ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 @@ -27,10 +27,10 @@ define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zero ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4) ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8) ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 
4 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 @@ -46,10 +46,10 @@ define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 sign ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4) ; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s8) ; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 @@ -65,10 +65,10 @@ define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) no ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 2, align 8, addrspace 4) ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16) ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 @@ -84,10 +84,10 @@ define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 ze ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 2, align 8, addrspace 4) ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16) ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 @@ -103,10 +103,10 @@ define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 si ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; 
HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 2, align 8, addrspace 4) ; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s16) ; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 @@ -122,10 +122,10 @@ define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) no ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4, align 8, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -140,10 +140,10 @@ define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `float addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4 from `float addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4, align 8, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -158,10 +158,10 @@ define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<2 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) 
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 2 from `<2 x i8> addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 2, align 8, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<2 x s8>), [[LOAD]](p1) :: (store 2 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -176,10 +176,10 @@ define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<2 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4 from `<2 x i16> addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4, align 8, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<2 x s16>), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -194,10 +194,10 @@ define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<2 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8 from `<2 x i32> addrspace(4)* undef`, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store 8 into %ir.out, align 4, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -212,10 +212,10 @@ define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, < ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<2 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8 from `<2 x float> addrspace(4)* undef`, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = 
G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store 8 into %ir.out, align 4, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -230,10 +230,10 @@ define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<3 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 3 from `<3 x i8> addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 3, align 8, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<3 x s8>), [[LOAD]](p1) :: (store 3 into %ir.out, align 4, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -248,10 +248,10 @@ define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<3 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 6 from `<3 x i16> addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 6, align 8, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<3 x s16>), [[LOAD]](p1) :: (store 6 into %ir.out, align 4, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -266,10 +266,10 @@ define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<3 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 12 from `<3 x i32> addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 12, align 16, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store 12 into %ir.out, align 4, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -284,10 +284,10 @@ define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* 
nocapture %out, < ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<3 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 12 from `<3 x float> addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 12, align 16, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store 12 into %ir.out, align 4, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -302,10 +302,10 @@ define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<4 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4 from `<4 x i8> addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4, align 8, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<4 x s8>), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -320,10 +320,10 @@ define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<4 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8 from `<4 x i16> addrspace(4)* undef`, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<4 x s16>), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -338,10 +338,10 @@ define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<4 x i32> addrspace(1)* 
addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 16 from `<4 x i32> addrspace(4)* undef`, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 16, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store 16 into %ir.out, align 4, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -356,10 +356,10 @@ define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, < ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<4 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 16 from `<4 x float> addrspace(4)* undef`, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 16, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store 16 into %ir.out, align 4, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -374,10 +374,10 @@ define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<8 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8 from `<8 x i8> addrspace(4)* undef`, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<8 x s8>), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -392,10 +392,10 @@ define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<8 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD 
[[GEP1]](p4) :: (dereferenceable invariant load 16 from `<8 x i16> addrspace(4)* undef`, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 16, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<8 x s16>), [[LOAD]](p1) :: (store 16 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -410,10 +410,10 @@ define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<8 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 32 from `<8 x i32> addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 32, align 16, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store 32 into %ir.out, align 4, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -428,10 +428,10 @@ define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, < ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<8 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 32 from `<8 x float> addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 32, align 16, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store 32 into %ir.out, align 4, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -446,10 +446,10 @@ define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<16 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 16 from `<16 x i8> addrspace(4)* undef`, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 16, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<16 x s8>), [[LOAD]](p1) :: (store 16 into 
%ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -464,10 +464,10 @@ define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<16 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 32 from `<16 x i16> addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 32, align 16, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<16 x s16>), [[LOAD]](p1) :: (store 32 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -482,10 +482,10 @@ define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, < ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<16 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 64 from `<16 x i32> addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 64, align 16, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store 64 into %ir.out, align 4, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -500,10 +500,10 @@ define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `<16 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 64 from `<16 x float> addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 64, align 16, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store 64 into %ir.out, align 4, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -518,10 +518,10 @@ define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwi ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; 
HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 store i64 %a, i64 addrspace(1)* %out, align 8 @@ -535,10 +535,10 @@ define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `double addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8 from `double addrspace(4)* undef`, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 entry: @@ -553,10 +553,10 @@ define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `i1 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4) ; HSA-VI: G_STORE [[LOAD1]](s1), [[LOAD]](p1) :: (store 1 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 store i1 %x, i1 addrspace(1)* %out, align 1 @@ -570,10 +570,10 @@ define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwi ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: 
[[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4) ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s1) ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 @@ -589,10 +589,10 @@ define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwi ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4) ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD1]](s1) ; HSA-VI: G_STORE [[ZEXT]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 @@ -608,10 +608,10 @@ define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwi ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4) ; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s1) ; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 @@ -627,10 +627,10 @@ define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwi ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4) + ; 
HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4) ; HSA-VI: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD1]](s1) ; HSA-VI: G_STORE [[SEXT]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1) ; HSA-VI: S_ENDPGM 0 @@ -674,15 +674,15 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 16 from `{ i32, i64 } addrspace(4)* undef`, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 16, addrspace 4) ; HSA-VI: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD]](s128), 0 ; HSA-VI: [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD]](s128), 64 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1 from `i8 addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 16, addrspace 4) ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 ; HSA-VI: [[GEP2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s128) = G_LOAD [[GEP2]](p4) :: (dereferenceable invariant load 16 from `{ i32, i64 } addrspace(4)* undef`, align 8, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s128) = G_LOAD [[GEP2]](p4) :: (dereferenceable invariant load 16, align 8, addrspace 4) ; HSA-VI: [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD2]](s128), 0 ; HSA-VI: [[EXTRACT3:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD2]](s128), 64 ; HSA-VI: S_ENDPGM 0 @@ -706,15 +706,15 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; HSA-VI: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 12 from `<{ i32, i64 }> addrspace(4)* undef`, align 16, addrspace 4) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 12, align 16, addrspace 4) ; HSA-VI: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD]](s96), 0 ; HSA-VI: [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD]](s96), 32 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1 from `i8 addrspace(4)* undef`, align 4, addrspace 4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 4, addrspace 4) ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 13 ; HSA-VI: [[GEP2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s96) = G_LOAD [[GEP2]](p4) :: (dereferenceable invariant load 12 from `<{ i32, i64 }> addrspace(4)* undef`, align 1, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s96) = G_LOAD [[GEP2]](p4) :: (dereferenceable invariant load 12, align 1, addrspace 4) ; HSA-VI: [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD2]](s96), 0 ; HSA-VI: [[EXTRACT3:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD2]](s96), 32 ; HSA-VI: S_ENDPGM 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir index 2dec254171bc0c..dbf456f0c6f605 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir @@ -178,7 +178,7 @@ body: | ; VI: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 68 ; VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C2]](s64) - ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 4 from `i8 addrspace(4)* undef` + 68, addrspace 4) + ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 4, addrspace 4) ; VI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY1]](p5), [[C]] ; VI: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p5) ; VI: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32) @@ -261,7 +261,7 @@ body: | ; VI: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 ; VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C2]](s64) - ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 4 from `i8 addrspace(4)* undef` + 64, align 64, addrspace 4) + ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 4, align 64, addrspace 4) ; VI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY1]](p3), [[C]] ; VI: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p3) ; VI: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32) @@ -467,14 +467,14 @@ body: | ; VI: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 ; VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C2]](s64) - ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 4 from `i8 addrspace(4)* undef` + 64, align 64, addrspace 4) + ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 4, align 64, addrspace 4) ; VI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]] ; VI: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) ; VI: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32) ; VI: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]] ; VI: [[COPY3:%[0-9]+]]:_(p4) = COPY [[COPY]](p4) ; VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY3]], [[C2]](s64) - ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4 from `i8 addrspace(4)* undef` + 64, align 64, addrspace 4) + ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4, align 64, addrspace 4) ; VI: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]] ; VI: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) ; VI: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[LOAD1]](s32)
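
(Editor's note, not part of the patches above.) The AMDGPU change in PATCH 5/5 hinges on one point from its summary: a MachinePointerInfo only needs the address space, so there is no reason to fabricate an UndefValue of pointer type just to carry it. The sketch below contrasts the two constructions. It is a minimal illustration, not code from the patch; it assumes the usual LLVM headers, and the two helper function names (oldStylePtrInfo, newStylePtrInfo) are hypothetical, introduced only for the comparison. Both MachinePointerInfo constructors used here appear in the diffs above (the Value-based one in the removed lines, the address-space one in the added lines).

    // Illustrative sketch only; helper names are hypothetical.
    #include "llvm/CodeGen/MachineMemOperand.h"   // MachinePointerInfo
    #include "llvm/IR/Constants.h"                // UndefValue (old form only)
    #include "llvm/IR/DerivedTypes.h"             // PointerType (old form only)

    using namespace llvm;

    // Old form (removed by the patch): build a pointer type in the target
    // address space, materialize an UndefValue of that type, and hand the
    // fake IR value to MachinePointerInfo just so it records the address
    // space.
    static MachinePointerInfo oldStylePtrInfo(Type *ParamTy, unsigned ConstantAS) {
      PointerType *PtrTy = PointerType::get(ParamTy, ConstantAS);
      return MachinePointerInfo(UndefValue::get(PtrTy));
    }

    // New form (used by the patch): MachinePointerInfo can carry the address
    // space directly, with no IR value attached.
    static MachinePointerInfo newStylePtrInfo(unsigned ConstantAS) {
      return MachinePointerInfo(ConstantAS);
    }

The design consequence is visible in the updated MIR/GlobalISel tests above: memory operands that used to print as "load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4" now print only the size, alignment, and address space, since no IR value is referenced.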