From 684b43c0cfb1092a65c237b39d0662bfe0a2c97a Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Wed, 26 Aug 2020 04:50:29 +0900 Subject: [PATCH 01/19] [IR] Add NoUndef attribute to Intrinsics.td This patch adds NoUndef to Intrinsics.td. The attribute is attached to llvm.assume's operand, because llvm.assume(undef) is UB. It is attached to pointer operands of several memory accessing intrinsics as well. This change makes ValueTracking::getGuaranteedNonPoisonOps' intrinsic check unnecessary, so it is removed. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D86576 --- llvm/include/llvm/IR/Intrinsics.td | 18 +++++++++++++----- llvm/lib/Analysis/ValueTracking.cpp | 10 ---------- .../Transforms/EarlyCSE/invariant.start.ll | 2 ++ llvm/utils/TableGen/CodeGenIntrinsics.h | 1 + llvm/utils/TableGen/CodeGenTarget.cpp | 3 +++ llvm/utils/TableGen/IntrinsicEmitter.cpp | 6 ++++++ mlir/test/Target/llvmir-intrinsics.mlir | 8 ++++---- 7 files changed, 29 insertions(+), 19 deletions(-) diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 8ece1f8b1b1231..620dfb920f24d8 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -83,6 +83,11 @@ class NoAlias : IntrinsicProperty { int ArgNo = idx.Value; } +// NoUndef - The specified argument is neither undef nor poison. +class NoUndef : IntrinsicProperty { + int ArgNo = idx.Value; +} + class Align : IntrinsicProperty { int ArgNo = idx.Value; int Align = align; @@ -515,7 +520,8 @@ def int_readcyclecounter : Intrinsic<[llvm_i64_ty]>; // The assume intrinsic is marked as arbitrarily writing so that proper // control dependencies will be maintained. -def int_assume : Intrinsic<[], [llvm_i1_ty], [IntrWillReturn]>; +def int_assume : Intrinsic<[], [llvm_i1_ty], [IntrWillReturn, + NoUndef>]>; // Stack Protector Intrinsic - The stackprotector intrinsic writes the stack // guard to the correct place on the stack frame. 
@@ -1347,26 +1353,28 @@ def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMAnyPointerType>, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [IntrArgMemOnly, IntrWillReturn, ImmArg>]>; + [IntrArgMemOnly, IntrWillReturn, + NoUndef>, ImmArg>]>; def int_masked_load : Intrinsic<[llvm_anyvector_ty], [LLVMAnyPointerType>, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], [IntrReadMem, IntrArgMemOnly, IntrWillReturn, - ImmArg>]>; + NoUndef>, ImmArg>]>; def int_masked_gather: Intrinsic<[llvm_anyvector_ty], [LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], [IntrReadMem, IntrWillReturn, - ImmArg>]>; + NoUndef>, ImmArg>]>; def int_masked_scatter: Intrinsic<[], [llvm_anyvector_ty, LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [IntrWillReturn, ImmArg>]>; + [IntrWillReturn, NoUndef>, + ImmArg>]>; def int_masked_expandload: Intrinsic<[llvm_anyvector_ty], [LLVMPointerToElt<0>, diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 2835d2f06edd3f..36998cd9069713 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -5093,16 +5093,6 @@ void llvm::getGuaranteedNonPoisonOps(const Instruction *I, case Instruction::Call: case Instruction::Invoke: { - if (auto *II = dyn_cast(I)) { - switch (II->getIntrinsicID()) { - case Intrinsic::assume: - Operands.insert(II->getArgOperand(0)); - break; - default: - break; - } - } - const CallBase *CB = cast(I); if (CB->isIndirectCall()) Operands.insert(CB->getCalledOperand()); diff --git a/llvm/test/Transforms/EarlyCSE/invariant.start.ll b/llvm/test/Transforms/EarlyCSE/invariant.start.ll index d26ba496223f98..2202c09c1a0e99 100644 --- a/llvm/test/Transforms/EarlyCSE/invariant.start.ll +++ b/llvm/test/Transforms/EarlyCSE/invariant.start.ll @@ -525,3 +525,5 @@ define i32 @test_invariant_load_scope(i32* %p) { %sub = sub i32 %v1, %v2 ret i32 %sub } + +; USE_ASSUME: declare void @llvm.assume(i1 noundef) diff --git a/llvm/utils/TableGen/CodeGenIntrinsics.h b/llvm/utils/TableGen/CodeGenIntrinsics.h index 84ed0dc482fc24..c469f662a42d6d 100644 --- a/llvm/utils/TableGen/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/CodeGenIntrinsics.h @@ -148,6 +148,7 @@ struct CodeGenIntrinsic { enum ArgAttrKind { NoCapture, NoAlias, + NoUndef, Returned, ReadOnly, WriteOnly, diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp index 7824d8d1b34a0a..889110a2dc2128 100644 --- a/llvm/utils/TableGen/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/CodeGenTarget.cpp @@ -846,6 +846,9 @@ void CodeGenIntrinsic::setProperty(Record *R) { } else if (R->isSubClassOf("NoAlias")) { unsigned ArgNo = R->getValueAsInt("ArgNo"); ArgumentAttributes.emplace_back(ArgNo, NoAlias, 0); + } else if (R->isSubClassOf("NoUndef")) { + unsigned ArgNo = R->getValueAsInt("ArgNo"); + ArgumentAttributes.emplace_back(ArgNo, NoUndef, 0); } else if (R->isSubClassOf("Returned")) { unsigned ArgNo = R->getValueAsInt("ArgNo"); ArgumentAttributes.emplace_back(ArgNo, Returned, 0); diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp index 7e41914941498f..4e368fac2c834d 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -687,6 +687,12 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints, OS << "Attribute::NoAlias"; addComma = true; break; + case 
CodeGenIntrinsic::NoUndef: + if (addComma) + OS << ","; + OS << "Attribute::NoUndef"; + addComma = true; + break; case CodeGenIntrinsic::Returned: if (addComma) OS << ","; diff --git a/mlir/test/Target/llvmir-intrinsics.mlir b/mlir/test/Target/llvmir-intrinsics.mlir index 8a598e67d17b89..b4e5ca67943f46 100644 --- a/mlir/test/Target/llvmir-intrinsics.mlir +++ b/mlir/test/Target/llvmir-intrinsics.mlir @@ -322,10 +322,10 @@ llvm.func @memcpy_test(%arg0: !llvm.i32, %arg1: !llvm.i1, %arg2: !llvm.ptr, // CHECK-DAG: declare <48 x float> @llvm.matrix.column.major.load.v48f32(float* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) // CHECK-DAG: declare void @llvm.matrix.column.major.store.v48f32(<48 x float>, float* nocapture writeonly, i64, i1 immarg, i32 immarg, i32 immarg) // CHECK-DAG: declare <7 x i1> @llvm.get.active.lane.mask.v7i1.i64(i64, i64) -// CHECK-DAG: declare <7 x float> @llvm.masked.load.v7f32.p0v7f32(<7 x float>*, i32 immarg, <7 x i1>, <7 x float>) -// CHECK-DAG: declare void @llvm.masked.store.v7f32.p0v7f32(<7 x float>, <7 x float>*, i32 immarg, <7 x i1>) -// CHECK-DAG: declare <7 x float> @llvm.masked.gather.v7f32.v7p0f32(<7 x float*>, i32 immarg, <7 x i1>, <7 x float>) -// CHECK-DAG: declare void @llvm.masked.scatter.v7f32.v7p0f32(<7 x float>, <7 x float*>, i32 immarg, <7 x i1>) +// CHECK-DAG: declare <7 x float> @llvm.masked.load.v7f32.p0v7f32(<7 x float>* noundef, i32 immarg, <7 x i1>, <7 x float>) +// CHECK-DAG: declare void @llvm.masked.store.v7f32.p0v7f32(<7 x float>, <7 x float>* noundef, i32 immarg, <7 x i1>) +// CHECK-DAG: declare <7 x float> @llvm.masked.gather.v7f32.v7p0f32(<7 x float*> noundef, i32 immarg, <7 x i1>, <7 x float>) +// CHECK-DAG: declare void @llvm.masked.scatter.v7f32.v7p0f32(<7 x float>, <7 x float*> noundef, i32 immarg, <7 x i1>) // CHECK-DAG: declare <7 x float> @llvm.masked.expandload.v7f32(float*, <7 x i1>, <7 x float>) // CHECK-DAG: declare void @llvm.masked.compressstore.v7f32(<7 x float>, float*, <7 x i1>) // CHECK-DAG: declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) From c67ccf5fafc8c035f152ce30115bbdacf23530d5 Mon Sep 17 00:00:00 2001 From: Wei Mi Date: Mon, 24 Aug 2020 22:59:20 -0700 Subject: [PATCH 02/19] [SampleFDO] Enhance profile remapping support for searching inline instance and indirect call promotion candidate.

Profile remapping matches a function in the module with its profile in the sample profile when the function name and the name in the profile look different but are equivalent under the given remapping rules. This is useful for keeping performance stable: remapping rules can be specified when SampleFDO targets go through a large-scale function signature change.

However, profile remapping is currently only supported for outline function profiles in SampleFDO. It cannot match a callee with an inline instance profile if the two have different but equivalent names. We found that without support for inline instance profiles, remapping is less effective for some large-scale changes.

To add that support, before any remapping lookup happens, all names in the profile are inserted into the remapper, and the key-to-name mapping is recorded in a map called NameMap in the remapper. During name lookup, a key is returned for the given name and is used to extract an equivalent name in the profile from NameMap. With the help of the NameMap, we can translate any given name to an equivalent name in the profile if one exists.
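As an illustrative sketch (the names are taken from the new remap-2 test added by this patch; the rename direction is assumed for illustration): suppose a signature change renamed N::X<foo::quux> to M::X<foo::detail::qux>, so the module now calls _ZN3foo3barERKN1M1XINS_6detail3quxEEE while the profile still records the callee as an inline instance under the old name:

  test:15680:2500
    ...
    5: _ZN3foo3barERKN1N1XINS_4quuxEEE:2000
     1: 2000

With a remapping file that makes the two names equivalent (the new test reuses the existing Inputs/remap.map), every profile name is inserted into the remapper and its key recorded in NameMap; looking up the module name then yields the same key, and NameMap returns the profile name _ZN3foo3barERKN1N1XINS_4quuxEEE, so the inline instance profile can be found.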
Whenever we try to match a name in the module to a name in the profile, we first try the match with the original name; if that does not match, we use the equivalent name obtained from the remapper and try the match again. In this way, the patch extends profile remapping support to searching for inline instances and for indirect call promotion candidates.

In a planned large-scale change of the int64 type (long long) to int64_t (long), we found the performance of a Google-internal benchmark degraded by 2% if nothing was done. With the existing profile remapping enabled, the degradation dropped to 1.2%. With the profile remapping in the current patch enabled, the degradation further dropped to 0.14%. (Note: the experiment was done before remapping support for searching indirect call promotion candidates was added. With that support we hope the degradation can eventually drop to 0%; it will be evaluated post commit.)

Differential Revision: https://reviews.llvm.org/D86332 --- llvm/include/llvm/ProfileData/SampleProf.h | 49 +++++------ .../llvm/ProfileData/SampleProfReader.h | 26 +++++-- llvm/lib/ProfileData/SampleProf.cpp | 55 +++++++++++++- llvm/lib/ProfileData/SampleProfReader.cpp | 18 +++-- llvm/lib/Transforms/IPO/SampleProfile.cpp | 30 +++++--- .../SampleProfile/Inputs/remap-2.prof | 16 ++++ llvm/test/Transforms/SampleProfile/remap-2.ll | 74 +++++++++++++++++++ llvm/unittests/ProfileData/SampleProfTest.cpp | 51 ++++++++++++- 8 files changed, 257 insertions(+), 62 deletions(-) create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/remap-2.prof create mode 100644 llvm/test/Transforms/SampleProfile/remap-2.ll diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h index fa5326038ada08..aca941b2da15a8 100644 --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -342,6 +342,7 @@ class SampleRecord { raw_ostream &operator<<(raw_ostream &OS, const SampleRecord &Sample); class FunctionSamples; +class SampleProfileReaderItaniumRemapper; using BodySampleMap = std::map; // NOTE: Using a StringMap here makes parsed profiles consume around 17% more @@ -428,35 +429,15 @@ class FunctionSamples { return &iter->second; } - /// Returns a pointer to FunctionSamples at the given callsite location \p Loc - /// with callee \p CalleeName. If no callsite can be found, relax the - /// restriction to return the FunctionSamples at callsite location \p Loc - /// with the maximum total sample count. - const FunctionSamples *findFunctionSamplesAt(const LineLocation &Loc, - StringRef CalleeName) const { - std::string CalleeGUID; - CalleeName = getRepInFormat(CalleeName, UseMD5, CalleeGUID); - - auto iter = CallsiteSamples.find(Loc); - if (iter == CallsiteSamples.end()) - return nullptr; - auto FS = iter->second.find(CalleeName); - if (FS != iter->second.end()) - return &FS->second; - // If we cannot find exact match of the callee name, return the FS with - // the max total count. Only do this when CalleeName is not provided, - // i.e., only for indirect calls.
- if (!CalleeName.empty()) - return nullptr; - uint64_t MaxTotalSamples = 0; - const FunctionSamples *R = nullptr; - for (const auto &NameFS : iter->second) - if (NameFS.second.getTotalSamples() >= MaxTotalSamples) { - MaxTotalSamples = NameFS.second.getTotalSamples(); - R = &NameFS.second; - } - return R; - } + /// Returns a pointer to FunctionSamples at the given callsite location + /// \p Loc with callee \p CalleeName. If no callsite can be found, relax + /// the restriction to return the FunctionSamples at callsite location + /// \p Loc with the maximum total sample count. If \p Remapper is not + /// nullptr, use \p Remapper to find FunctionSamples with equivalent name + /// as \p CalleeName. + const FunctionSamples * + findFunctionSamplesAt(const LineLocation &Loc, StringRef CalleeName, + SampleProfileReaderItaniumRemapper *Remapper) const; bool empty() const { return TotalSamples == 0; } @@ -630,7 +611,11 @@ class FunctionSamples { /// tree nodes in the profile. /// /// \returns the FunctionSamples pointer to the inlined instance. - const FunctionSamples *findFunctionSamples(const DILocation *DIL) const; + /// If \p Remapper is not nullptr, it will be used to find matching + /// FunctionSamples with not exactly the same but equivalent name. + const FunctionSamples *findFunctionSamples( + const DILocation *DIL, + SampleProfileReaderItaniumRemapper *Remapper = nullptr) const; static SampleProfileFormat Format; @@ -648,6 +633,10 @@ class FunctionSamples { return UseMD5 ? std::stoull(Name.data()) : Function::getGUID(Name); } + // Find all the names in the current FunctionSamples including names in + // all the inline instances and names of call targets. + void findAllNames(DenseSet &NameSet) const; + private: /// Mangled name of the function. StringRef Name; diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h index 0e8ee7696c543e..385ac820f5b5be 100644 --- a/llvm/include/llvm/ProfileData/SampleProfReader.h +++ b/llvm/include/llvm/ProfileData/SampleProfReader.h @@ -208,6 +208,7 @@ #ifndef LLVM_PROFILEDATA_SAMPLEPROFREADER_H #define LLVM_PROFILEDATA_SAMPLEPROFREADER_H +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -275,15 +276,18 @@ class SampleProfileReaderItaniumRemapper { return Remappings->lookup(FunctionName); } - /// Return the samples collected for function \p F if remapper knows - /// it is present in SampleMap. - FunctionSamples *getSamplesFor(StringRef FunctionName); + /// Return the equivalent name in the profile for \p FunctionName if + /// it exists. + Optional lookUpNameInProfile(StringRef FunctionName); private: // The buffer holding the content read from remapping file. std::unique_ptr Buffer; std::unique_ptr Remappings; - DenseMap SampleMap; + // Map remapping key to the name in the profile. By looking up the + // key in the remapper, a given new name can be mapped to the + // cannonical name using the NameMap. + DenseMap NameMap; // The Reader the remapper is servicing. SampleProfileReader &Reader; // Indicate whether remapping has been applied to the profile read @@ -370,15 +374,19 @@ class SampleProfileReader { /// Return the samples collected for function \p F. 
virtual FunctionSamples *getSamplesFor(StringRef Fname) { - if (Remapper) { - if (auto FS = Remapper->getSamplesFor(Fname)) - return FS; - } std::string FGUID; Fname = getRepInFormat(Fname, useMD5(), FGUID); auto It = Profiles.find(Fname); if (It != Profiles.end()) return &It->second; + + if (Remapper) { + if (auto NameInProfile = Remapper->lookUpNameInProfile(Fname)) { + auto It = Profiles.find(*NameInProfile); + if (It != Profiles.end()) + return &It->second; + } + } return nullptr; } @@ -423,6 +431,8 @@ class SampleProfileReader { /// Return whether names in the profile are all MD5 numbers. virtual bool useMD5() { return false; } + SampleProfileReaderItaniumRemapper *getRemapper() { return Remapper.get(); } + protected: /// Map every function to its associated profile. /// diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp index e5d0fdba5fc4a0..6e0542f6d433fd 100644 --- a/llvm/lib/ProfileData/SampleProf.cpp +++ b/llvm/lib/ProfileData/SampleProf.cpp @@ -14,6 +14,7 @@ #include "llvm/ProfileData/SampleProf.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/ProfileData/SampleProfReader.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" @@ -174,8 +175,8 @@ unsigned FunctionSamples::getOffset(const DILocation *DIL) { 0xffff; } -const FunctionSamples * -FunctionSamples::findFunctionSamples(const DILocation *DIL) const { +const FunctionSamples *FunctionSamples::findFunctionSamples( + const DILocation *DIL, SampleProfileReaderItaniumRemapper *Remapper) const { assert(DIL); SmallVector, 10> S; @@ -190,11 +191,59 @@ FunctionSamples::findFunctionSamples(const DILocation *DIL) const { return this; const FunctionSamples *FS = this; for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) { - FS = FS->findFunctionSamplesAt(S[i].first, S[i].second); + FS = FS->findFunctionSamplesAt(S[i].first, S[i].second, Remapper); } return FS; } +void FunctionSamples::findAllNames(DenseSet &NameSet) const { + NameSet.insert(Name); + for (const auto &BS : BodySamples) + for (const auto &TS : BS.second.getCallTargets()) + NameSet.insert(TS.getKey()); + + for (const auto &CS : CallsiteSamples) { + for (const auto &NameFS : CS.second) { + NameSet.insert(NameFS.first); + NameFS.second.findAllNames(NameSet); + } + } +} + +const FunctionSamples *FunctionSamples::findFunctionSamplesAt( + const LineLocation &Loc, StringRef CalleeName, + SampleProfileReaderItaniumRemapper *Remapper) const { + std::string CalleeGUID; + CalleeName = getRepInFormat(CalleeName, UseMD5, CalleeGUID); + + auto iter = CallsiteSamples.find(Loc); + if (iter == CallsiteSamples.end()) + return nullptr; + auto FS = iter->second.find(CalleeName); + if (FS != iter->second.end()) + return &FS->second; + if (Remapper) { + if (auto NameInProfile = Remapper->lookUpNameInProfile(CalleeName)) { + auto FS = iter->second.find(*NameInProfile); + if (FS != iter->second.end()) + return &FS->second; + } + } + // If we cannot find exact match of the callee name, return the FS with + // the max total count. Only do this when CalleeName is not provided, + // i.e., only for indirect calls. 
+ if (!CalleeName.empty()) + return nullptr; + uint64_t MaxTotalSamples = 0; + const FunctionSamples *R = nullptr; + for (const auto &NameFS : iter->second) + if (NameFS.second.getTotalSamples() >= MaxTotalSamples) { + MaxTotalSamples = NameFS.second.getTotalSamples(); + R = &NameFS.second; + } + return R; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void FunctionSamples::dump() const { print(dbgs(), 0); } #endif diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index 03f1ac190b91c7..59fae9e236f377 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -1291,18 +1291,22 @@ void SampleProfileReaderItaniumRemapper::applyRemapping(LLVMContext &Ctx) { } assert(Remappings && "should be initialized while creating remapper"); - for (auto &Sample : Reader.getProfiles()) - if (auto Key = Remappings->insert(Sample.first())) - SampleMap.insert({Key, &Sample.second}); + for (auto &Sample : Reader.getProfiles()) { + DenseSet NamesInSample; + Sample.second.findAllNames(NamesInSample); + for (auto &Name : NamesInSample) + if (auto Key = Remappings->insert(Name)) + NameMap.insert({Key, Name}); + } RemappingApplied = true; } -FunctionSamples * -SampleProfileReaderItaniumRemapper::getSamplesFor(StringRef Fname) { +Optional +SampleProfileReaderItaniumRemapper::lookUpNameInProfile(StringRef Fname) { if (auto Key = Remappings->lookup(Fname)) - return SampleMap.lookup(Key); - return nullptr; + return NameMap.lookup(Key); + return None; } /// Prepare a memory buffer for the contents of \p Filename. diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 2c520a1b5b6b7b..997701e5721f57 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -840,7 +840,7 @@ SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const { return FS->findFunctionSamplesAt(LineLocation(FunctionSamples::getOffset(DIL), DIL->getBaseDiscriminator()), - CalleeName); + CalleeName, Reader->getRemapper()); } /// Returns a vector of FunctionSamples that are the indirect call targets @@ -903,7 +903,7 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { auto it = DILocation2SampleMap.try_emplace(DIL,nullptr); if (it.second) - it.first->second = Samples->findFunctionSamples(DIL); + it.first->second = Samples->findFunctionSamples(DIL, Reader->getRemapper()); return it.first->second; } @@ -1050,24 +1050,23 @@ bool SampleProfileLoader::inlineHotFunctions( PSI->getOrCompHotCountThreshold()); continue; } - auto CalleeFunctionName = FS->getFuncName(); + if (!callsiteIsHot(FS, PSI)) + continue; + + const char *Reason = "Callee function not available"; + // R->getValue() != &F is to prevent promoting a recursive call. // If it is a recursive call, we do not inline it as it could bloat // the code exponentially. There is way to better handle this, e.g. // clone the caller first, and inline the cloned caller if it is // recursive. As llvm does not inline recursive calls, we will // simply ignore it instead of handling it explicitly. 
- if (CalleeFunctionName == F.getName()) - continue; - - if (!callsiteIsHot(FS, PSI)) - continue; - - const char *Reason = "Callee function not available"; + auto CalleeFunctionName = FS->getFuncName(); auto R = SymbolMap.find(CalleeFunctionName); if (R != SymbolMap.end() && R->getValue() && !R->getValue()->isDeclaration() && R->getValue()->getSubprogram() && R->getValue()->hasFnAttribute("use-sample-profile") && + R->getValue() != &F && isLegalToPromote(*I, R->getValue(), &Reason)) { uint64_t C = FS->getEntrySamples(); auto &DI = @@ -1854,7 +1853,6 @@ bool SampleProfileLoader::doInitialization(Module &M, FunctionAnalysisManager *FAM) { auto &Ctx = M.getContext(); - std::unique_ptr RemapReader; auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx, RemappingFilename); if (std::error_code EC = ReaderOrErr.getError()) { @@ -1910,6 +1908,7 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, for (const auto &I : Reader->getProfiles()) TotalCollectedSamples += I.second.getTotalSamples(); + auto Remapper = Reader->getRemapper(); // Populate the symbol map. for (const auto &N_F : M.getValueSymbolTable()) { StringRef OrigName = N_F.getKey(); @@ -1927,6 +1926,15 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, // to nullptr to avoid confusion. if (!r.second) r.first->second = nullptr; + OrigName = NewName; + } + // Insert the remapped names into SymbolMap. + if (Remapper) { + if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) { + if (*MapName == OrigName) + continue; + SymbolMap.insert(std::make_pair(*MapName, F)); + } } } diff --git a/llvm/test/Transforms/SampleProfile/Inputs/remap-2.prof b/llvm/test/Transforms/SampleProfile/Inputs/remap-2.prof new file mode 100644 index 00000000000000..6e0f9066dbf5d8 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/remap-2.prof @@ -0,0 +1,16 @@ +test:15680:2500 + 1: 100 + 4: 100 + 5: 3000 xoo:1000 + 5: _ZN3foo3barERKN1N1XINS_4quuxEEE:2000 + 1: 2000 + 6: _ZN1N1XE:2500 + 1: 2500 + +_ZN1N1X1YE:15680:2500 + 1: 100 + 4: 100 + 5: 3000 xoo:1000 + 5: _ZN1N1X1YE:2000 + 1: 2000 + diff --git a/llvm/test/Transforms/SampleProfile/remap-2.ll b/llvm/test/Transforms/SampleProfile/remap-2.ll new file mode 100644 index 00000000000000..668fa275e1332b --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/remap-2.ll @@ -0,0 +1,74 @@ +; RUN: opt %s -passes=sample-profile -sample-profile-file=%S/Inputs/remap-2.prof -sample-profile-remapping-file=%S/Inputs/remap.map -S | FileCheck %s +; Check profile remapping works for searching inline instance, searching +; indirect call promotion candidate and prevent recursive inline. + +@x.addr = common global i32 zeroinitializer, align 16 +@y.addr = common global i32 zeroinitializer, align 16 + +define i32 @_ZN3foo3barERKN1M1XINS_6detail3quxEEE() #0 !dbg !9 { +entry: + %t0 = load i32, i32* @x.addr, align 4 + %t1 = load i32, i32* @y.addr, align 4 + %add = add nsw i32 %t0, %t1 + ret i32 %add +} + +define i32 @_ZN1M1XE() #0 !dbg !10 { +entry: + %t0 = load i32, i32* @x.addr, align 4 + %t1 = load i32, i32* @y.addr, align 4 + %sub = sub nsw i32 %t0, %t1 + ret i32 %sub +} + +define void @test(i32 ()*) #0 !dbg !4 { + %t2 = alloca i32 ()* + store i32 ()* %0, i32 ()** %t2 + %t3 = load i32 ()*, i32 ()** %t2 +; Check call i32 %t3 has been indirect call promoted and call i32 @_ZN1M1XE +; has been inlined. 
+; CHECK-LABEL: @test( +; CHECK: icmp eq i32 ()* %t3, @_ZN3foo3barERKN1M1XINS_6detail3quxEEE +; CHECK-NOT: call i32 @_ZN1M1XE + %t4 = call i32 %t3(), !dbg !7 + %t5 = call i32 @_ZN1M1XE(), !dbg !8 + ret void +} + +define void @_ZN1M1X1YE(i32 ()*) #0 !dbg !11 { + %t2 = alloca i32 ()* + store i32 ()* %0, i32 ()** %t2 + %t3 = load i32 ()*, i32 ()** %t2 +; Check call i32 %t3 has got its profile but is not indirect call promoted +; because the promotion candidate is a recursive call to the current function. +; CHECK-LABEL: @_ZN1M1X1YE( +; CHECK: call i32 %t3(), {{.*}} !prof ![[PROFID:[0-9]+]] +; CHECK-NOT: icmp eq i32 ()* %t3, @_ZN1M1X1YE + %t4 = call i32 %t3(), !dbg !12 + ret void +} + +; CHECK: ![[PROFID]] = !{!"VP", i32 0, i64 3000 + +attributes #0 = { "use-sample-profile" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!13, !14} +!llvm.ident = !{!15} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "calls.cc", directory: ".") +!2 = !{} +!4 = distinct !DISubprogram(name: "test", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!5 = !DIFile(filename: "calls.cc", directory: ".") +!6 = !DISubroutineType(types: !2) +!7 = !DILocation(line: 8, scope: !4) +!8 = !DILocation(line: 9, scope: !4) +!9 = distinct !DISubprogram(name: "_ZN3foo3barERKN1M1XINS_6detail3quxEEE", line: 15, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!10 = distinct !DISubprogram(name: "_ZN1M1XE", line: 20, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!11 = distinct !DISubprogram(name: "_ZN1M1X1YE", line: 25, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!12 = !DILocation(line: 30, scope: !11) +!13 = !{i32 2, !"Dwarf Version", i32 4} +!14 = !{i32 1, !"Debug Info Version", i32 3} +!15 = !{!"clang version 3.5 "} + diff --git a/llvm/unittests/ProfileData/SampleProfTest.cpp b/llvm/unittests/ProfileData/SampleProfTest.cpp index 26b01e4b284999..cef5f042dd375a 100644 --- a/llvm/unittests/ProfileData/SampleProfTest.cpp +++ b/llvm/unittests/ProfileData/SampleProfTest.cpp @@ -89,8 +89,8 @@ struct SampleProfTest : ::testing::Test { auto VerifySummary = [IsPartialProfile, PartialProfileRatio]( ProfileSummary &Summary) mutable { ASSERT_EQ(ProfileSummary::PSK_Sample, Summary.getKind()); - ASSERT_EQ(137392u, Summary.getTotalCount()); - ASSERT_EQ(8u, Summary.getNumCounts()); + ASSERT_EQ(138211u, Summary.getTotalCount()); + ASSERT_EQ(10u, Summary.getNumCounts()); ASSERT_EQ(4u, Summary.getNumFunctions()); ASSERT_EQ(1437u, Summary.getMaxFunctionCount()); ASSERT_EQ(60351u, Summary.getMaxCount()); @@ -112,7 +112,7 @@ struct SampleProfTest : ::testing::Test { ASSERT_EQ(60000u, EightyPerc->MinCount); ASSERT_EQ(12557u, NinetyPerc->MinCount); ASSERT_EQ(12557u, NinetyFivePerc->MinCount); - ASSERT_EQ(610u, NinetyNinePerc->MinCount); + ASSERT_EQ(600u, NinetyNinePerc->MinCount); }; VerifySummary(Summary); @@ -155,6 +155,22 @@ struct SampleProfTest : ::testing::Test { 
FooSamples.addBodySamples(8, 0, 60351); FooSamples.addBodySamples(10, 0, 605); + // Add inline instance with name "_Z3gooi". + StringRef GooName("_Z3gooi"); + auto &GooSamples = + FooSamples.functionSamplesAt(LineLocation(7, 0))[GooName.str()]; + GooSamples.setName(GooName); + GooSamples.addTotalSamples(502); + GooSamples.addBodySamples(3, 0, 502); + + // Add inline instance with name "_Z3hooi". + StringRef HooName("_Z3hooi"); + auto &HooSamples = + GooSamples.functionSamplesAt(LineLocation(9, 0))[HooName.str()]; + HooSamples.setName(HooName); + HooSamples.addTotalSamples(317); + HooSamples.addBodySamples(4, 0, 317); + StringRef BarName("_Z3bari"); FunctionSamples BarSamples; BarSamples.setName(BarName); @@ -197,6 +213,8 @@ struct SampleProfTest : ::testing::Test { createRemapFile(RemapPath, RemapFile); FooName = "_Z4fauxi"; BarName = "_Z3barl"; + GooName = "_Z3gool"; + HooName = "_Z3hool"; } M.getOrInsertFunction(FooName, fn_type); @@ -235,6 +253,33 @@ struct SampleProfTest : ::testing::Test { ASSERT_EQ(7711u, ReadFooSamples->getTotalSamples()); ASSERT_EQ(610u, ReadFooSamples->getHeadSamples()); + // Try to find a FunctionSamples with GooName at given callsites containing + // inline instance for GooName. Test the correct FunctionSamples can be + // found with Remapper support. + const FunctionSamples *ReadGooSamples = + ReadFooSamples->findFunctionSamplesAt(LineLocation(7, 0), GooName, + Reader->getRemapper()); + ASSERT_TRUE(ReadGooSamples != nullptr); + ASSERT_EQ(502u, ReadGooSamples->getTotalSamples()); + + // Try to find a FunctionSamples with GooName at given callsites containing + // no inline instance for GooName. Test no FunctionSamples will be + // found with Remapper support. + const FunctionSamples *ReadGooSamplesAgain = + ReadFooSamples->findFunctionSamplesAt(LineLocation(9, 0), GooName, + Reader->getRemapper()); + ASSERT_TRUE(ReadGooSamplesAgain == nullptr); + + // The inline instance of Hoo is inside of the inline instance of Goo. + // Try to find a FunctionSamples with HooName at given callsites containing + // inline instance for HooName. Test the correct FunctionSamples can be + // found with Remapper support. + const FunctionSamples *ReadHooSamples = + ReadGooSamples->findFunctionSamplesAt(LineLocation(9, 0), HooName, + Reader->getRemapper()); + ASSERT_TRUE(ReadHooSamples != nullptr); + ASSERT_EQ(317u, ReadHooSamples->getTotalSamples()); + FunctionSamples *ReadBarSamples = Reader->getSamplesFor(BarName); ASSERT_TRUE(ReadBarSamples != nullptr); if (!UseMD5) { From f78687df9b790b4f4177a72cbd25b49d14c437b4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 26 Aug 2020 10:45:07 -0400 Subject: [PATCH 03/19] AMDGPU: Don't assert on misaligned DS read2/write2 offsets This would assert with unaligned DS access enabled. The offset may not be aligned. Theoretically the pattern predicate should check the memory alignment, although it is possible to have the memory be aligned but not the immediate offset. In this case I would expect it to use ds_{read|write}_b64 with unaligned access, but am not clear if there's a reason it doesn't. 
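As a concrete sketch of the failure mode (this mirrors the read2_v2i32_align1_odd_offset test added below; the arithmetic spelled out here is an illustration using the 4-byte element size of ds_read2_b32):

  @v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1
  ; load <2 x i32> from byte offset 65 of @v2i32_align1, align 1

With the global at LDS address 0, the constant address is 65. For ds_read2_b32 the element size is 4, so OffsetValue0 = 65 / 4 = 16 and Align * OffsetValue0 = 64 != 65, which used to trip the assert. After this change the misaligned constant is simply not folded into the immediate offset fields; on gfx900 with unaligned access enabled the new test expects the 0x41 base to be materialized in a VGPR and a ds_read2_b32 with offset1:1 to be used against it.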
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 4 +-- llvm/test/CodeGen/AMDGPU/ds_read2.ll | 27 ++++++++++++++++--- llvm/test/CodeGen/AMDGPU/ds_write2.ll | 18 ++++++++++++- 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 2a5dac1f1e1068..151b1bdd553817 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1304,9 +1304,9 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base, } else if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { unsigned OffsetValue0 = CAddr->getZExtValue() / Align; unsigned OffsetValue1 = OffsetValue0 + 1; - assert(Align * OffsetValue0 == CAddr->getZExtValue()); + bool OffsetIsAligned = Align * OffsetValue0 == CAddr->getZExtValue(); - if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1)) { + if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1) && OffsetIsAligned) { SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero); diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 2454efaa5e354a..47ae95eefea9cf 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,-unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,+unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s ; FIXME: We don't get cases where the address was an SGPR because we ; get a copy to the address register for each one. 
@@ -317,7 +318,9 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-NOT: ds_read2_b32 +; CI-COUNT-4: ds_read_u8 +; GFX9-ALIGNED-4: ds_read_u8 +; GFX9-UNALIGNED-4: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}} ; GCN: s_endpgm define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -336,7 +339,9 @@ define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float a ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-NOT: ds_read2_b32 +; CI-COUNT-2: ds_read_u16 +; GFX9-ALIGNED-2: ds_read_u16 +; GFX9-UNALIGNED-4: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}} ; GCN: s_endpgm define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -655,6 +660,22 @@ define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrsp ret <2 x float> %r1 } +@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 + +; GCN-LABEL: {{^}}read2_v2i32_align1_odd_offset: +; CI-COUNT-8: ds_read_u8 + +; GFX9-ALIGNED-COUNT-8: ds_read_u8 + +; GFX9-UNALIGNED: v_mov_b32_e32 [[BASE_ADDR:v[0-9]+]], 0x41{{$}} +; GFX9-UNALIGNED: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_ADDR]] offset1:1{{$}} +define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* %out) { +entry: + %load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 + store <2 x i32> %load, <2 x i32> addrspace(1)* %out + ret void +} + declare void @void_func_void() #3 declare i32 @llvm.amdgcn.workgroup.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 6b0ce6391ca892..dce2884d77c3d2 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,-unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,+unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s @lds = addrspace(3) global [512 x float] undef, align 4 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 @@ -523,6 +524,21 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrs ret void } +@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 + +; GCN-LABEL: {{^}}write2_v2i32_align1_odd_offset: +; CI-COUNT-8: ds_write_b8 + +; GFX9-ALIGNED-COUNT-8: ds_write_b8 + +; GFX9-UNALIGNED: v_mov_b32_e32 [[BASE_ADDR:v[0-9]+]], 0x41{{$}} +; GFX9-UNALIGNED: ds_write2_b32 [[BASE_ADDR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} +define amdgpu_kernel 
void @write2_v2i32_align1_odd_offset() { +entry: + store <2 x i32> , <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 + ret void +} + declare i32 @llvm.amdgcn.workgroup.id.x() #1 declare i32 @llvm.amdgcn.workgroup.id.y() #1 declare i32 @llvm.amdgcn.workitem.id.x() #1 From e15143d31bca3973db51714af6361f3e77a9e058 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 24 Aug 2020 18:29:57 -0500 Subject: [PATCH 04/19] [Hexagon] Implement llvm.masked.load and llvm.masked.store for HVX --- llvm/lib/Target/Hexagon/HexagonISelLowering.h | 3 +- .../Target/Hexagon/HexagonISelLoweringHVX.cpp | 169 ++++++++++++++++-- llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 2 + llvm/lib/Target/Hexagon/HexagonPatternsHVX.td | 8 + .../Hexagon/HexagonTargetTransformInfo.cpp | 14 +- .../Hexagon/HexagonTargetTransformInfo.h | 3 + .../Hexagon/autohvx/masked-vmem-basic.ll | 35 ++++ .../test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll | 2 +- .../test/CodeGen/Hexagon/store-vector-pred.ll | 2 +- 9 files changed, 215 insertions(+), 23 deletions(-) create mode 100644 llvm/test/CodeGen/Hexagon/autohvx/masked-vmem-basic.ll diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 7aee7df917b462..ccea0da46e0d5f 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -469,8 +469,7 @@ namespace HexagonISD { SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxShift(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerHvxStore(SDValue Op, SelectionDAG &DAG) const; - SDValue HvxVecPredBitcastComputation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const; SDValue SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const; SDValue SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 7de7d414bd8074..6e0733775ec4a9 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -97,6 +97,8 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::CTTZ, T, Custom); setOperationAction(ISD::LOAD, T, Custom); + setOperationAction(ISD::MLOAD, T, Custom); + setOperationAction(ISD::MSTORE, T, Custom); setOperationAction(ISD::MUL, T, Custom); setOperationAction(ISD::MULHS, T, Custom); setOperationAction(ISD::MULHU, T, Custom); @@ -150,6 +152,8 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::LOAD, T, Custom); setOperationAction(ISD::STORE, T, Custom); + setOperationAction(ISD::MLOAD, T, Custom); + setOperationAction(ISD::MSTORE, T, Custom); setOperationAction(ISD::CTLZ, T, Custom); setOperationAction(ISD::CTTZ, T, Custom); setOperationAction(ISD::CTPOP, T, Custom); @@ -188,6 +192,9 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::AND, BoolW, Custom); setOperationAction(ISD::OR, BoolW, Custom); setOperationAction(ISD::XOR, BoolW, Custom); + // Masked load/store takes a mask that may need splitting. 
+ setOperationAction(ISD::MLOAD, BoolW, Custom); + setOperationAction(ISD::MSTORE, BoolW, Custom); } for (MVT T : LegalV) { @@ -1593,7 +1600,7 @@ HexagonTargetLowering::LowerHvxShift(SDValue Op, SelectionDAG &DAG) const { SDValue HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const { - const SDLoc &dl(Op); + const SDLoc &dl(Op); MVT ResTy = ty(Op); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); @@ -1613,6 +1620,75 @@ HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const { return Op; } +SDValue +HexagonTargetLowering::LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const { + const SDLoc &dl(Op); + unsigned HwLen = Subtarget.getVectorLength(); + auto *MaskN = cast(Op.getNode()); + SDValue Mask = MaskN->getMask(); + SDValue Chain = MaskN->getChain(); + SDValue Base = MaskN->getBasePtr(); + auto *MemOp = MaskN->getMemOperand(); + + unsigned Opc = Op->getOpcode(); + assert(Opc == ISD::MLOAD || Opc == ISD::MSTORE); + + if (Opc == ISD::MLOAD) { + MVT ValTy = ty(Op); + SDValue Load = DAG.getLoad(ValTy, dl, Chain, Base, MaskN->getMemOperand()); + SDValue Thru = cast(MaskN)->getPassThru(); + if (isUndef(Thru)) + return Load; + SDValue VSel = DAG.getNode(ISD::VSELECT, dl, ValTy, Mask, Load, Thru); + return DAG.getMergeValues({VSel, Load.getValue(1)}, dl); + } + + // MSTORE + // HVX only has aligned masked stores. + + // TODO: Fold negations of the mask into the store. + unsigned StoreOpc = Hexagon::V6_vS32b_qpred_ai; + SDValue Value = cast(MaskN)->getValue(); + SDValue Offset0 = DAG.getTargetConstant(0, dl, ty(Base)); + + if (MaskN->getAlign().value() % HwLen == 0) { + SDValue Store = getInstr(StoreOpc, dl, MVT::Other, + {Mask, Base, Offset0, Value, Chain}, DAG); + DAG.setNodeMemRefs(cast(Store.getNode()), {MemOp}); + return Store; + } + + // Unaligned case. + auto StoreAlign = [&](SDValue V, SDValue A) { + SDValue Z = getZero(dl, ty(V), DAG); + // TODO: use funnel shifts? + // vlalign(Vu,Vv,Rt) rotates the pair Vu:Vv left by Rt and takes the + // upper half. 
+ SDValue LoV = getInstr(Hexagon::V6_vlalignb, dl, ty(V), {V, Z, A}, DAG); + SDValue HiV = getInstr(Hexagon::V6_vlalignb, dl, ty(V), {Z, V, A}, DAG); + return std::make_pair(LoV, HiV); + }; + + MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen); + MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen); + SDValue MaskV = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, Mask); + VectorPair Tmp = StoreAlign(MaskV, Base); + VectorPair MaskU = {DAG.getNode(HexagonISD::V2Q, dl, BoolTy, Tmp.first), + DAG.getNode(HexagonISD::V2Q, dl, BoolTy, Tmp.second)}; + VectorPair ValueU = StoreAlign(Value, Base); + + SDValue Offset1 = DAG.getTargetConstant(HwLen, dl, MVT::i32); + SDValue StoreLo = + getInstr(StoreOpc, dl, MVT::Other, + {MaskU.first, Base, Offset0, ValueU.first, Chain}, DAG); + SDValue StoreHi = + getInstr(StoreOpc, dl, MVT::Other, + {MaskU.second, Base, Offset1, ValueU.second, Chain}, DAG); + DAG.setNodeMemRefs(cast(StoreLo.getNode()), {MemOp}); + DAG.setNodeMemRefs(cast(StoreHi.getNode()), {MemOp}); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, {StoreLo, StoreHi}); +} + SDValue HexagonTargetLowering::SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const { assert(!Op.isMachineOpcode()); @@ -1648,45 +1724,81 @@ HexagonTargetLowering::SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const { SDValue HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const { - LSBaseSDNode *BN = cast(Op.getNode()); - assert(BN->isUnindexed()); - MVT MemTy = BN->getMemoryVT().getSimpleVT(); + auto *MemN = cast(Op.getNode()); + + MVT MemTy = MemN->getMemoryVT().getSimpleVT(); if (!isHvxPairTy(MemTy)) return Op; const SDLoc &dl(Op); unsigned HwLen = Subtarget.getVectorLength(); MVT SingleTy = typeSplit(MemTy).first; - SDValue Chain = BN->getChain(); - SDValue Base0 = BN->getBasePtr(); + SDValue Chain = MemN->getChain(); + SDValue Base0 = MemN->getBasePtr(); SDValue Base1 = DAG.getMemBasePlusOffset(Base0, TypeSize::Fixed(HwLen), dl); MachineMemOperand *MOp0 = nullptr, *MOp1 = nullptr; - if (MachineMemOperand *MMO = BN->getMemOperand()) { + if (MachineMemOperand *MMO = MemN->getMemOperand()) { MachineFunction &MF = DAG.getMachineFunction(); MOp0 = MF.getMachineMemOperand(MMO, 0, HwLen); MOp1 = MF.getMachineMemOperand(MMO, HwLen, HwLen); } - unsigned MemOpc = BN->getOpcode(); - SDValue NewOp; + unsigned MemOpc = MemN->getOpcode(); if (MemOpc == ISD::LOAD) { + assert(cast(Op)->isUnindexed()); SDValue Load0 = DAG.getLoad(SingleTy, dl, Chain, Base0, MOp0); SDValue Load1 = DAG.getLoad(SingleTy, dl, Chain, Base1, MOp1); - NewOp = DAG.getMergeValues( - { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, Load0, Load1), - DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - Load0.getValue(1), Load1.getValue(1)) }, dl); - } else { - assert(MemOpc == ISD::STORE); + return DAG.getMergeValues( + { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, Load0, Load1), + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + Load0.getValue(1), Load1.getValue(1)) }, dl); + } + if (MemOpc == ISD::STORE) { + assert(cast(Op)->isUnindexed()); VectorPair Vals = opSplit(cast(Op)->getValue(), dl, DAG); SDValue Store0 = DAG.getStore(Chain, dl, Vals.first, Base0, MOp0); SDValue Store1 = DAG.getStore(Chain, dl, Vals.second, Base1, MOp1); - NewOp = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store0, Store1); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store0, Store1); + } + + assert(MemOpc == ISD::MLOAD || MemOpc == ISD::MSTORE); + + auto MaskN = cast(Op); + assert(MaskN->isUnindexed()); + VectorPair Masks = opSplit(MaskN->getMask(), dl, DAG); + SDValue Offset = 
DAG.getUNDEF(MVT::i32); + + if (MemOpc == ISD::MLOAD) { + VectorPair Thru = + opSplit(cast(Op)->getPassThru(), dl, DAG); + SDValue MLoad0 = + DAG.getMaskedLoad(SingleTy, dl, Chain, Base0, Offset, Masks.first, + Thru.first, SingleTy, MOp0, ISD::UNINDEXED, + ISD::NON_EXTLOAD, false); + SDValue MLoad1 = + DAG.getMaskedLoad(SingleTy, dl, Chain, Base1, Offset, Masks.second, + Thru.second, SingleTy, MOp1, ISD::UNINDEXED, + ISD::NON_EXTLOAD, false); + return DAG.getMergeValues( + { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, MLoad0, MLoad1), + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + MLoad0.getValue(1), MLoad1.getValue(1)) }, dl); + } + if (MemOpc == ISD::MSTORE) { + VectorPair Vals = opSplit(cast(Op)->getValue(), dl, DAG); + SDValue MStore0 = DAG.getMaskedStore(Chain, dl, Vals.first, Base0, Offset, + Masks.first, SingleTy, MOp0, + ISD::UNINDEXED, false, false); + SDValue MStore1 = DAG.getMaskedStore(Chain, dl, Vals.second, Base1, Offset, + Masks.second, SingleTy, MOp1, + ISD::UNINDEXED, false, false); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MStore0, MStore1); } - return NewOp; + std::string Name = "Unexpected operation: " + Op->getOperationName(&DAG); + llvm_unreachable(Name.c_str()); } SDValue @@ -1749,6 +1861,8 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SETCC: case ISD::INTRINSIC_VOID: return Op; case ISD::INTRINSIC_WO_CHAIN: return LowerHvxIntrinsic(Op, DAG); + case ISD::MLOAD: + case ISD::MSTORE: return LowerHvxMaskedOp(Op, DAG); // Unaligned loads will be handled by the default lowering. case ISD::LOAD: return SDValue(); } @@ -1761,6 +1875,25 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { void HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { + unsigned Opc = N->getOpcode(); + SDValue Op(N, 0); + + switch (Opc) { + case ISD::MLOAD: + if (isHvxPairTy(ty(Op))) { + SDValue S = SplitHvxMemOp(Op, DAG); + assert(S->getOpcode() == ISD::MERGE_VALUES); + Results.push_back(S.getOperand(0)); + Results.push_back(S.getOperand(1)); + } + break; + case ISD::MSTORE: + if (isHvxPairTy(ty(Op->getOperand(1)))) { // Stored value + SDValue S = SplitHvxMemOp(Op, DAG); + Results.push_back(S); + } + break; + } } void @@ -1783,6 +1916,8 @@ HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N, SDValue HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); const SDLoc &dl(N); SDValue Op(N, 0); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index d1cd23c3be3e50..93215a4b61870b 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -2721,6 +2721,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::PS_vloadrw_nt_ai: case Hexagon::V6_vL32b_ai: case Hexagon::V6_vS32b_ai: + case Hexagon::V6_vS32b_qpred_ai: + case Hexagon::V6_vS32b_nqpred_ai: case Hexagon::V6_vL32b_nt_ai: case Hexagon::V6_vS32b_nt_ai: case Hexagon::V6_vL32Ub_ai: diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index 078a7135c55be4..0e5772bd690f25 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -364,6 +364,14 @@ let Predicates = [UseHVX] in { (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 16)), (A2_tfrsi 16))>; } + // Take a pair of vectors Vt:Vs 
and shift them towards LSB by (Rt & HwLen). + def: Pat<(VecI8 (valign HVI8:$Vt, HVI8:$Vs, I32:$Rt)), + (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>; + def: Pat<(VecI16 (valign HVI16:$Vt, HVI16:$Vs, I32:$Rt)), + (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>; + def: Pat<(VecI32 (valign HVI32:$Vt, HVI32:$Vs, I32:$Rt)), + (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>; + def: Pat<(HexagonVASL HVI8:$Vs, I32:$Rt), (V6_vpackeb (V6_vaslh (HiVec (VZxtb HvxVR:$Vs)), I32:$Rt), (V6_vaslh (LoVec (VZxtb HvxVR:$Vs)), I32:$Rt))>; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index ce674d638ccb44..cbd60f36d8c6ec 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -35,6 +35,9 @@ static cl::opt EmitLookupTables("hexagon-emit-lookup-tables", cl::init(true), cl::Hidden, cl::desc("Control lookup table emission on Hexagon target")); +static cl::opt HexagonMaskedVMem("hexagon-masked-vmem", cl::init(true), + cl::Hidden, cl::desc("Enable loop vectorizer for HVX")); + // Constant "cost factor" to make floating point operations more expensive // in terms of vectorization cost. This isn't the best way, but it should // do. Ultimately, the cost should use cycles. @@ -45,8 +48,7 @@ bool HexagonTTIImpl::useHVX() const { } bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const { - assert(VecTy->isVectorTy()); - if (isa(VecTy)) + if (!VecTy->isVectorTy() || isa(VecTy)) return false; // Avoid types like <2 x i32*>. if (!cast(VecTy)->getElementType()->isIntegerTy()) @@ -308,6 +310,14 @@ unsigned HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return 1; } +bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/) { + return HexagonMaskedVMem && isTypeForHVX(DataType); +} + +bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/) { + return HexagonMaskedVMem && isTypeForHVX(DataType); +} + /// --- Vector TTI end --- unsigned HexagonTTIImpl::getPrefetchDistance() const { diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 07e59fb5585e80..b99f512df76651 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -155,6 +155,9 @@ class HexagonTTIImpl : public BasicTTIImplBase { return 1; } + bool isLegalMaskedStore(Type *DataType, Align Alignment); + bool isLegalMaskedLoad(Type *DataType, Align Alignment); + /// @} int getUserCost(const User *U, ArrayRef Operands, diff --git a/llvm/test/CodeGen/Hexagon/autohvx/masked-vmem-basic.ll b/llvm/test/CodeGen/Hexagon/autohvx/masked-vmem-basic.ll new file mode 100644 index 00000000000000..9836d2d5cb5cab --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/masked-vmem-basic.ll @@ -0,0 +1,35 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; CHECK-LABEL: f0: +; CHECK: vmemu +; CHECK: vmux +define <128 x i8> @f0(<128 x i8>* %a0, i32 %a1, i32 %a2) #0 { + %q0 = call <128 x i1> @llvm.hexagon.V6.pred.scalar2.128B(i32 %a2) + %v0 = call <32 x i32> @llvm.hexagon.V6.lvsplatb.128B(i32 %a1) + %v1 = bitcast <32 x i32> %v0 to <128 x i8> + %v2 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %a0, i32 4, <128 x i1> %q0, <128 x i8> %v1) + ret <128 x i8> %v2 +} + +; CHECK-LABEL: f1: +; CHECK: vlalign +; CHECK: if (q{{.}}) vmem{{.*}} = v +define void @f1(<128 x i8>* %a0, i32 %a1, i32 %a2) #0 { + %q0 = call <128 x i1> 
@llvm.hexagon.V6.pred.scalar2.128B(i32 %a2) + %v0 = call <32 x i32> @llvm.hexagon.V6.lvsplatb.128B(i32 %a1) + %v1 = bitcast <32 x i32> %v0 to <128 x i8> + call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> %v1, <128 x i8>* %a0, i32 4, <128 x i1> %q0) + ret void +} + +declare <128 x i1> @llvm.hexagon.V6.pred.scalar2.128B(i32) #1 +declare <32 x i32> @llvm.hexagon.V6.lvsplatb.128B(i32) #1 +declare <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>*, i32 immarg, <128 x i1>, <128 x i8>) #2 +declare void @llvm.masked.store.v128i8.p0v128i8(<128 x i8>, <128 x i8>*, i32 immarg, <128 x i1>) #2 + +attributes #0 = { "target-cpu"="hexagonv65" "target-features"="+hvx,+hvx-length128b" } +attributes #1 = { nounwind readnone } +attributes #2 = { argmemonly nounwind readonly willreturn } +attributes #3 = { argmemonly nounwind willreturn } + + diff --git a/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll b/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll index c44e7a863840ed..cb135f72448fe6 100644 --- a/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll +++ b/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -hexagon-instsimplify=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-instsimplify=0 -hexagon-masked-vmem=0 < %s | FileCheck %s ; Test that LLVM does not assert and bitcast v64i1 to i64 is lowered ; without crashing. diff --git a/llvm/test/CodeGen/Hexagon/store-vector-pred.ll b/llvm/test/CodeGen/Hexagon/store-vector-pred.ll index a177f87ddfbd56..d9d841cacc5bbe 100644 --- a/llvm/test/CodeGen/Hexagon/store-vector-pred.ll +++ b/llvm/test/CodeGen/Hexagon/store-vector-pred.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -hexagon-instsimplify=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-instsimplify=0 -hexagon-masked-vmem=0 < %s | FileCheck %s ; This test checks that store a vector predicate of type v128i1 is lowered ; without crashing. 
From 19e883fc59887e98a49ec03557ad2b6bc5537e03 Mon Sep 17 00:00:00 2001 From: Christopher Tetreault Date: Wed, 26 Aug 2020 10:06:28 -0700 Subject: [PATCH 05/19] [SVE] Remove calls to VectorType::getNumElements from clang Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D82582 --- clang/lib/CodeGen/CGAtomic.cpp | 2 +- clang/lib/CodeGen/CGBuiltin.cpp | 150 +++++++++++++------------ clang/lib/CodeGen/CGExpr.cpp | 11 +- clang/lib/CodeGen/CGExprScalar.cpp | 31 ++--- clang/lib/CodeGen/SwiftCallingConv.cpp | 16 ++- 5 files changed, 114 insertions(+), 96 deletions(-) diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp index d7720a23dd7206..a640cb7b5a6ecb 100644 --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -119,7 +119,7 @@ namespace { ValueTy = lvalue.getType(); ValueSizeInBits = C.getTypeSize(ValueTy); AtomicTy = ValueTy = CGF.getContext().getExtVectorType( - lvalue.getType(), cast( + lvalue.getType(), cast( lvalue.getExtVectorAddress().getElementType()) ->getNumElements()); AtomicSizeInBits = C.getTypeSize(AtomicTy); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index d002624cef4d24..24e33c164009d9 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4561,11 +4561,11 @@ Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID, getTarget().getTriple().getArch()); } -static llvm::VectorType *GetNeonType(CodeGenFunction *CGF, - NeonTypeFlags TypeFlags, - bool HasLegalHalfType = true, - bool V1Ty = false, - bool AllowBFloatArgsAndRet = true) { +static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF, + NeonTypeFlags TypeFlags, + bool HasLegalHalfType = true, + bool V1Ty = false, + bool AllowBFloatArgsAndRet = true) { int IsQuad = TypeFlags.isQuad(); switch (TypeFlags.getEltType()) { case NeonTypeFlags::Int8: @@ -5621,8 +5621,8 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( const bool AllowBFloatArgsAndRet = getTargetHooks().getABIInfo().allowBFloatArgsAndRet(); - llvm::VectorType *VTy = GetNeonType(this, Type, HasLegalHalfType, false, - AllowBFloatArgsAndRet); + llvm::FixedVectorType *VTy = + GetNeonType(this, Type, HasLegalHalfType, false, AllowBFloatArgsAndRet); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -5663,8 +5663,8 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs"); return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs"); case NEON::BI__builtin_neon_vaddhn_v: { - llvm::VectorType *SrcTy = - llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); // %sum = add <4 x i32> %lhs, %rhs Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); @@ -5936,14 +5936,16 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]); } case NEON::BI__builtin_neon_vmovl_v: { - llvm::Type *DTy =llvm::VectorType::getTruncatedElementVectorType(VTy); + llvm::FixedVectorType *DTy = + llvm::FixedVectorType::getTruncatedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], DTy); if (Usgn) return Builder.CreateZExt(Ops[0], Ty, "vmovl"); return Builder.CreateSExt(Ops[0], Ty, "vmovl"); } case NEON::BI__builtin_neon_vmovn_v: { - llvm::Type *QTy = llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *QTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], 
QTy); return Builder.CreateTrunc(Ops[0], Ty, "vmovn"); } @@ -5989,7 +5991,7 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vqdmulh_lane_v: case NEON::BI__builtin_neon_vqrdmulhq_lane_v: case NEON::BI__builtin_neon_vqrdmulh_lane_v: { - auto *RTy = cast(Ty); + auto *RTy = cast(Ty); if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v || BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v) RTy = llvm::FixedVectorType::get(RTy->getElementType(), @@ -6038,7 +6040,8 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1], "vshl_n"); case NEON::BI__builtin_neon_vshll_n_v: { - llvm::Type *SrcTy = llvm::VectorType::getTruncatedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getTruncatedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); if (Usgn) Ops[0] = Builder.CreateZExt(Ops[0], VTy); @@ -6048,7 +6051,8 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( return Builder.CreateShl(Ops[0], Ops[1], "vshll_n"); } case NEON::BI__builtin_neon_vshrn_n_v: { - llvm::Type *SrcTy = llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false); if (Usgn) @@ -6097,8 +6101,8 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, ""); } case NEON::BI__builtin_neon_vsubhn_v: { - llvm::VectorType *SrcTy = - llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); // %sum = add <4 x i32> %lhs, %rhs Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); @@ -6310,7 +6314,7 @@ static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef Ops, // Build a vector containing sequential number like (0, 1, 2, ..., 15) SmallVector Indices; - llvm::VectorType *TblTy = cast(Ops[0]->getType()); + auto *TblTy = cast(Ops[0]->getType()); for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) { Indices.push_back(2*i); Indices.push_back(2*i+1); @@ -7151,10 +7155,9 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, bool usgn = Type.isUnsigned(); bool rightShift = false; - llvm::VectorType *VTy = GetNeonType(this, Type, - getTarget().hasLegalHalfType(), - false, - getTarget().hasBFloat16Type()); + llvm::FixedVectorType *VTy = + GetNeonType(this, Type, getTarget().hasLegalHalfType(), false, + getTarget().hasBFloat16Type()); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -7362,7 +7365,7 @@ static llvm::Value *VectorUnzip(CGBuilderTy &Builder, llvm::Value *V, bool Odd) // or odds, as desired). 
SmallVector Indices; unsigned InputElements = - cast(V->getType())->getNumElements(); + cast(V->getType())->getNumElements(); for (unsigned i = 0; i < InputElements; i += 2) Indices.push_back(i + Odd); return Builder.CreateShuffleVector(V, llvm::UndefValue::get(V->getType()), @@ -7375,7 +7378,7 @@ static llvm::Value *VectorZip(CGBuilderTy &Builder, llvm::Value *V0, assert(V0->getType() == V1->getType() && "Can't zip different vector types"); SmallVector Indices; unsigned InputElements = - cast(V0->getType())->getNumElements(); + cast(V0->getType())->getNumElements(); for (unsigned i = 0; i < InputElements; i++) { Indices.push_back(i); Indices.push_back(i + InputElements); @@ -7571,7 +7574,7 @@ static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID // Determine the type of this overloaded NEON intrinsic. NeonTypeFlags Type = Result->getZExtValue(); - llvm::VectorType *Ty = GetNeonType(&CGF, Type); + llvm::FixedVectorType *Ty = GetNeonType(&CGF, Type); if (!Ty) return nullptr; @@ -9773,7 +9776,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, } } - llvm::VectorType *VTy = GetNeonType(this, Type); + llvm::FixedVectorType *VTy = GetNeonType(this, Type); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -9834,13 +9837,13 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla"); } case NEON::BI__builtin_neon_vfma_laneq_v: { - llvm::VectorType *VTy = cast(Ty); + auto *VTy = cast(Ty); // v1f64 fma should be mapped to Neon scalar f64 fma if (VTy && VTy->getElementType() == DoubleTy) { Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy); - llvm::Type *VTy = GetNeonType(this, - NeonTypeFlags(NeonTypeFlags::Float64, false, true)); + llvm::FixedVectorType *VTy = + GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, true)); Ops[2] = Builder.CreateBitCast(Ops[2], VTy); Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract"); Value *Result; @@ -10208,8 +10211,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v) Quad = true; Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); - llvm::Type *VTy = GetNeonType(this, - NeonTypeFlags(NeonTypeFlags::Float64, false, Quad)); + llvm::FixedVectorType *VTy = + GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, Quad)); Ops[1] = Builder.CreateBitCast(Ops[1], VTy); Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract"); Value *Result = Builder.CreateFMul(Ops[0], Ops[1]); @@ -11081,7 +11084,8 @@ static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef Ops, llvm::PointerType::getUnqual(Ops[1]->getType())); Value *MaskVec = getMaskVecValue( - CGF, Ops[2], cast(Ops[1]->getType())->getNumElements()); + CGF, Ops[2], + cast(Ops[1]->getType())->getNumElements()); return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Alignment, MaskVec); } @@ -11093,7 +11097,8 @@ static Value *EmitX86MaskedLoad(CodeGenFunction &CGF, ArrayRef Ops, llvm::PointerType::getUnqual(Ops[1]->getType())); Value *MaskVec = getMaskVecValue( - CGF, Ops[2], cast(Ops[1]->getType())->getNumElements()); + CGF, Ops[2], + cast(Ops[1]->getType())->getNumElements()); return CGF.Builder.CreateMaskedLoad(Ptr, Alignment, MaskVec, Ops[1]); } @@ -11107,7 +11112,8 @@ static Value *EmitX86ExpandLoad(CodeGenFunction &CGF, Value *Ptr = CGF.Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(PtrTy)); - Value *MaskVec = getMaskVecValue(CGF, 
Ops[2], ResultTy->getNumElements()); + Value *MaskVec = getMaskVecValue( + CGF, Ops[2], cast(ResultTy)->getNumElements()); llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload, ResultTy); @@ -11117,7 +11123,7 @@ static Value *EmitX86ExpandLoad(CodeGenFunction &CGF, static Value *EmitX86CompressExpand(CodeGenFunction &CGF, ArrayRef Ops, bool IsCompress) { - auto *ResultTy = cast(Ops[1]->getType()); + auto *ResultTy = cast(Ops[1]->getType()); Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements()); @@ -11129,7 +11135,7 @@ static Value *EmitX86CompressExpand(CodeGenFunction &CGF, static Value *EmitX86CompressStore(CodeGenFunction &CGF, ArrayRef Ops) { - auto *ResultTy = cast(Ops[1]->getType()); + auto *ResultTy = cast(Ops[1]->getType()); llvm::Type *PtrTy = ResultTy->getElementType(); // Cast the pointer to element type. @@ -11165,7 +11171,7 @@ static Value *EmitX86FunnelShift(CodeGenFunction &CGF, Value *Op0, Value *Op1, // Funnel shifts amounts are treated as modulo and types are all power-of-2 so // we only care about the lowest log2 bits anyway. if (Amt->getType() != Ty) { - unsigned NumElts = cast(Ty)->getNumElements(); + unsigned NumElts = cast(Ty)->getNumElements(); Amt = CGF.Builder.CreateIntCast(Amt, Ty->getScalarType(), false); Amt = CGF.Builder.CreateVectorSplat(NumElts, Amt); } @@ -11224,7 +11230,7 @@ static Value *EmitX86Select(CodeGenFunction &CGF, return Op0; Mask = getMaskVecValue( - CGF, Mask, cast(Op0->getType())->getNumElements()); + CGF, Mask, cast(Op0->getType())->getNumElements()); return CGF.Builder.CreateSelect(Mask, Op0, Op1); } @@ -11271,7 +11277,7 @@ static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC, assert((Ops.size() == 2 || Ops.size() == 4) && "Unexpected number of arguments"); unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *Cmp; if (CC == 3) { @@ -11548,7 +11554,8 @@ static Value *EmitX86Ternlog(CodeGenFunction &CGF, bool ZeroMask, static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op, llvm::Type *DstTy) { - unsigned NumberOfElements = cast(DstTy)->getNumElements(); + unsigned NumberOfElements = + cast(DstTy)->getNumElements(); Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements); return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2"); } @@ -11584,11 +11591,12 @@ static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction &CGF, return CGF.Builder.CreateCall(F, {Ops[0], Ops[1], Ops[2], Ops[3]}); } - unsigned NumDstElts = cast(DstTy)->getNumElements(); + unsigned NumDstElts = cast(DstTy)->getNumElements(); Value *Src = Ops[0]; // Extract the subvector. - if (NumDstElts != cast(Src->getType())->getNumElements()) { + if (NumDstElts != + cast(Src->getType())->getNumElements()) { assert(NumDstElts == 4 && "Unexpected vector size"); Src = CGF.Builder.CreateShuffleVector(Src, UndefValue::get(Src->getType()), ArrayRef{0, 1, 2, 3}); @@ -11887,7 +11895,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vec_ext_v8si: case X86::BI__builtin_ia32_vec_ext_v4di: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); uint64_t Index = cast(Ops[1])->getZExtValue(); Index &= NumElts - 1; // These builtins exist so we can ensure the index is an ICE and in range. 
@@ -11903,7 +11911,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vec_set_v8si: case X86::BI__builtin_ia32_vec_set_v4di: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned Index = cast(Ops[2])->getZExtValue(); Index &= NumElts - 1; // These builtins exist so we can ensure the index is an ICE and in range. @@ -12329,9 +12337,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, break; } - unsigned MinElts = - std::min(cast(Ops[0]->getType())->getNumElements(), - cast(Ops[2]->getType())->getNumElements()); + unsigned MinElts = std::min( + cast(Ops[0]->getType())->getNumElements(), + cast(Ops[2]->getType())->getNumElements()); Ops[3] = getMaskVecValue(*this, Ops[3], MinElts); Function *Intr = CGM.getIntrinsic(IID); return Builder.CreateCall(Intr, Ops); @@ -12438,9 +12446,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, break; } - unsigned MinElts = - std::min(cast(Ops[2]->getType())->getNumElements(), - cast(Ops[3]->getType())->getNumElements()); + unsigned MinElts = std::min( + cast(Ops[2]->getType())->getNumElements(), + cast(Ops[3]->getType())->getNumElements()); Ops[1] = getMaskVecValue(*this, Ops[1], MinElts); Function *Intr = CGM.getIntrinsic(IID); return Builder.CreateCall(Intr, Ops); @@ -12462,10 +12470,10 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_extracti64x2_256_mask: case X86::BI__builtin_ia32_extractf64x2_512_mask: case X86::BI__builtin_ia32_extracti64x2_512_mask: { - auto *DstTy = cast(ConvertType(E->getType())); + auto *DstTy = cast(ConvertType(E->getType())); unsigned NumElts = DstTy->getNumElements(); unsigned SrcNumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned SubVectors = SrcNumElts / NumElts; unsigned Index = cast(Ops[1])->getZExtValue(); assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors"); @@ -12503,9 +12511,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_insertf64x2_512: case X86::BI__builtin_ia32_inserti64x2_512: { unsigned DstNumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned SrcNumElts = - cast(Ops[1]->getType())->getNumElements(); + cast(Ops[1]->getType())->getNumElements(); unsigned SubVectors = DstNumElts / SrcNumElts; unsigned Index = cast(Ops[2])->getZExtValue(); assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors"); @@ -12570,7 +12578,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_pblendd128: case X86::BI__builtin_ia32_pblendd256: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned Imm = cast(Ops[2])->getZExtValue(); int Indices[16]; @@ -12587,7 +12595,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: { uint32_t Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); // Splat the 8-bits of immediate 4 times to help the loop wrap around. 
@@ -12611,7 +12619,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_pshufhw256: case X86::BI__builtin_ia32_pshufhw512: { uint32_t Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); // Splat the 8-bits of immediate 4 times to help the loop wrap around. @@ -12641,7 +12649,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vpermilpd512: case X86::BI__builtin_ia32_vpermilps512: { uint32_t Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128; unsigned NumLaneElts = NumElts / NumLanes; @@ -12668,7 +12676,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_shufps256: case X86::BI__builtin_ia32_shufps512: { uint32_t Imm = cast(Ops[2])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128; unsigned NumLaneElts = NumElts / NumLanes; @@ -12696,7 +12704,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_permdi512: case X86::BI__builtin_ia32_permdf512: { unsigned Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); // These intrinsics operate on 256-bit lanes of four 64-bit elements. @@ -12715,7 +12723,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, unsigned ShiftVal = cast(Ops[2])->getZExtValue() & 0xff; unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); assert(NumElts % 16 == 0); // If palignr is shifting the pair of vectors more than the size of two @@ -12753,7 +12761,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_alignq256: case X86::BI__builtin_ia32_alignq512: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned ShiftVal = cast(Ops[2])->getZExtValue() & 0xff; // Mask the shift amount to width of two vectors. @@ -12776,7 +12784,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_shuf_i32x4: case X86::BI__builtin_ia32_shuf_i64x2: { unsigned Imm = cast(Ops[2])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2; unsigned NumLaneElts = NumElts / NumLanes; @@ -12803,7 +12811,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_permti256: { unsigned Imm = cast(Ops[2])->getZExtValue(); unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); // This takes a very simple approach since there are two lanes and a // shuffle can have 2 inputs. 
So we reserve the first input for the first @@ -12841,7 +12849,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_pslldqi256_byteshift: case X86::BI__builtin_ia32_pslldqi512_byteshift: { unsigned ShiftVal = cast(Ops[1])->getZExtValue() & 0xff; - auto *ResultType = cast(Ops[0]->getType()); + auto *ResultType = cast(Ops[0]->getType()); // Builtin type is vXi64 so multiply by 8 to get bytes. unsigned NumElts = ResultType->getNumElements() * 8; @@ -12871,7 +12879,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_psrldqi256_byteshift: case X86::BI__builtin_ia32_psrldqi512_byteshift: { unsigned ShiftVal = cast(Ops[1])->getZExtValue() & 0xff; - auto *ResultType = cast(Ops[0]->getType()); + auto *ResultType = cast(Ops[0]->getType()); // Builtin type is vXi64 so multiply by 8 to get bytes. unsigned NumElts = ResultType->getNumElements() * 8; @@ -13518,7 +13526,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_fpclasspd256_mask: case X86::BI__builtin_ia32_fpclasspd512_mask: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *MaskIn = Ops[2]; Ops.erase(&Ops[2]); @@ -13556,7 +13564,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vp2intersect_d_256: case X86::BI__builtin_ia32_vp2intersect_d_128: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Intrinsic::ID ID; switch (BuiltinID) { @@ -13615,7 +13623,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vpshufbitqmb256_mask: case X86::BI__builtin_ia32_vpshufbitqmb512_mask: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *MaskIn = Ops[2]; Ops.erase(&Ops[2]); @@ -13762,7 +13770,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Function *Intr = CGM.getIntrinsic(IID); if (IsMaskFCmp) { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Ops[3] = getMaskVecValue(*this, Ops[3], NumElts); Value *Cmp = Builder.CreateCall(Intr, Ops); return EmitX86MaskedCompareResult(*this, Cmp, NumElts, nullptr); @@ -13777,7 +13785,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // We ignore SAE if strict FP is disabled. We only keep precise // exception behavior under strict FP. 
unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *Cmp; if (IsSignaling) Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]); @@ -13835,7 +13843,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: { Ops[2] = getMaskVecValue( *this, Ops[2], - cast(Ops[0]->getType())->getNumElements()); + cast(Ops[0]->getType())->getNumElements()); Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128; return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); } diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 5d74d91065f56e..7351926035e64d 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -1685,7 +1685,7 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile, if (Ty->isVectorType()) { const llvm::Type *EltTy = Addr.getElementType(); - const auto *VTy = cast(EltTy); + const auto *VTy = cast(EltTy); // Handle vectors of size 3 like size 4 for better performance. if (VTy->getNumElements() == 3) { @@ -1770,8 +1770,9 @@ static Address MaybeConvertMatrixAddress(Address Addr, CodeGenFunction &CGF, auto *VectorTy = dyn_cast( cast(Addr.getPointer()->getType())->getElementType()); if (VectorTy && !IsVector) { - auto *ArrayTy = llvm::ArrayType::get(VectorTy->getElementType(), - VectorTy->getNumElements()); + auto *ArrayTy = llvm::ArrayType::get( + VectorTy->getElementType(), + cast(VectorTy)->getNumElements()); return Address(CGF.Builder.CreateElementBitCast(Addr, ArrayTy)); } @@ -1802,7 +1803,7 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr, llvm::Type *SrcTy = Value->getType(); auto *VecTy = dyn_cast(SrcTy); // Handle vec3 special. - if (VecTy && VecTy->getNumElements() == 3) { + if (VecTy && cast(VecTy)->getNumElements() == 3) { // Our source is a vec3, do a shuffle vector to make it a vec4. Value = Builder.CreateShuffleVector(Value, llvm::UndefValue::get(VecTy), ArrayRef{0, 1, 2, -1}, @@ -2217,7 +2218,7 @@ void CodeGenFunction::EmitStoreThroughExtVectorComponentLValue(RValue Src, if (const VectorType *VTy = Dst.getType()->getAs()) { unsigned NumSrcElts = VTy->getNumElements(); unsigned NumDstElts = - cast(Vec->getType())->getNumElements(); + cast(Vec->getType())->getNumElements(); if (NumDstElts == NumSrcElts) { // Use shuffle vector is the src and destination are the same number of // elements and restore the vector mask since it is on the side it will be diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index d0ec50f4e011ef..511c6a66d55554 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -1320,7 +1320,7 @@ Value *ScalarExprEmitter::EmitScalarConversion(Value *Src, QualType SrcType, "Splatted expr doesn't match with vector element type?"); // Splat the element across to all elements - unsigned NumElements = cast(DstTy)->getNumElements(); + unsigned NumElements = cast(DstTy)->getNumElements(); return Builder.CreateVectorSplat(NumElements, Src, "splat"); } @@ -1553,12 +1553,12 @@ Value *ScalarExprEmitter::VisitShuffleVectorExpr(ShuffleVectorExpr *E) { Value *RHS = CGF.EmitScalarExpr(E->getExpr(1)); Value *Mask; - llvm::VectorType *LTy = cast(LHS->getType()); + auto *LTy = cast(LHS->getType()); unsigned LHSElts = LTy->getNumElements(); Mask = RHS; - llvm::VectorType *MTy = cast(Mask->getType()); + auto *MTy = cast(Mask->getType()); // Mask off the high bits of each shuffle index. 
Value *MaskBits = @@ -1763,7 +1763,7 @@ Value *ScalarExprEmitter::VisitInitListExpr(InitListExpr *E) { return Visit(E->getInit(0)); } - unsigned ResElts = VType->getNumElements(); + unsigned ResElts = cast(VType)->getNumElements(); // Loop over initializers collecting the Value for each, and remembering // whether the source was swizzle (ExtVectorElementExpr). This will allow @@ -1787,7 +1787,8 @@ Value *ScalarExprEmitter::VisitInitListExpr(InitListExpr *E) { if (isa(IE)) { llvm::ExtractElementInst *EI = cast(Init); - if (EI->getVectorOperandType()->getNumElements() == ResElts) { + if (cast(EI->getVectorOperandType()) + ->getNumElements() == ResElts) { llvm::ConstantInt *C = cast(EI->getIndexOperand()); Value *LHS = nullptr, *RHS = nullptr; if (CurIdx == 0) { @@ -1825,7 +1826,7 @@ Value *ScalarExprEmitter::VisitInitListExpr(InitListExpr *E) { continue; } - unsigned InitElts = VVT->getNumElements(); + unsigned InitElts = cast(VVT)->getNumElements(); // If the initializer is an ExtVecEltExpr (a swizzle), and the swizzle's // input is the same width as the vector being constructed, generate an @@ -1834,7 +1835,7 @@ Value *ScalarExprEmitter::VisitInitListExpr(InitListExpr *E) { if (isa(IE)) { llvm::ShuffleVectorInst *SVI = cast(Init); Value *SVOp = SVI->getOperand(0); - llvm::VectorType *OpTy = cast(SVOp->getType()); + auto *OpTy = cast(SVOp->getType()); if (OpTy->getNumElements() == ResElts) { for (unsigned j = 0; j != CurIdx; ++j) { @@ -2170,7 +2171,7 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) { llvm::Type *DstTy = ConvertType(DestTy); Value *Elt = Visit(const_cast(E)); // Splat the element across to all elements - unsigned NumElements = cast(DstTy)->getNumElements(); + unsigned NumElements = cast(DstTy)->getNumElements(); return Builder.CreateVectorSplat(NumElements, Elt, "splat"); } @@ -4331,7 +4332,7 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { llvm::Value *RHS = Visit(rhsExpr); llvm::Type *condType = ConvertType(condExpr->getType()); - llvm::VectorType *vecTy = cast(condType); + auto *vecTy = cast(condType); unsigned numElem = vecTy->getNumElements(); llvm::Type *elemType = vecTy->getElementType(); @@ -4534,10 +4535,14 @@ Value *ScalarExprEmitter::VisitAsTypeExpr(AsTypeExpr *E) { llvm::Type *DstTy = ConvertType(E->getType()); llvm::Type *SrcTy = Src->getType(); - unsigned NumElementsSrc = isa(SrcTy) ? - cast(SrcTy)->getNumElements() : 0; - unsigned NumElementsDst = isa(DstTy) ? - cast(DstTy)->getNumElements() : 0; + unsigned NumElementsSrc = + isa(SrcTy) + ? cast(SrcTy)->getNumElements() + : 0; + unsigned NumElementsDst = + isa(DstTy) + ? cast(DstTy)->getNumElements() + : 0; // Going from vec3 to non-vec3 is a special case and requires a shuffle // vector to get a vec4, then a bitcast if the target type is different. diff --git a/clang/lib/CodeGen/SwiftCallingConv.cpp b/clang/lib/CodeGen/SwiftCallingConv.cpp index 3d7421ac2e16c2..cbbe208426f73e 100644 --- a/clang/lib/CodeGen/SwiftCallingConv.cpp +++ b/clang/lib/CodeGen/SwiftCallingConv.cpp @@ -320,9 +320,12 @@ void SwiftAggLowering::addEntry(llvm::Type *type, // If we have a vector type, split it. 
if (auto vecTy = dyn_cast_or_null(type)) { auto eltTy = vecTy->getElementType(); - CharUnits eltSize = (end - begin) / vecTy->getNumElements(); + CharUnits eltSize = + (end - begin) / cast(vecTy)->getNumElements(); assert(eltSize == getTypeStoreSize(CGM, eltTy)); - for (unsigned i = 0, e = vecTy->getNumElements(); i != e; ++i) { + for (unsigned i = 0, + e = cast(vecTy)->getNumElements(); + i != e; ++i) { addEntry(eltTy, begin, begin + eltSize); begin += eltSize; } @@ -674,8 +677,9 @@ bool swiftcall::isLegalIntegerType(CodeGenModule &CGM, bool swiftcall::isLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize, llvm::VectorType *vectorTy) { - return isLegalVectorType(CGM, vectorSize, vectorTy->getElementType(), - vectorTy->getNumElements()); + return isLegalVectorType( + CGM, vectorSize, vectorTy->getElementType(), + cast(vectorTy)->getNumElements()); } bool swiftcall::isLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize, @@ -688,7 +692,7 @@ bool swiftcall::isLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize, std::pair swiftcall::splitLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize, llvm::VectorType *vectorTy) { - auto numElts = vectorTy->getNumElements(); + auto numElts = cast(vectorTy)->getNumElements(); auto eltTy = vectorTy->getElementType(); // Try to split the vector type in half. @@ -710,7 +714,7 @@ void swiftcall::legalizeVectorType(CodeGenModule &CGM, CharUnits origVectorSize, } // Try to split the vector into legal subvectors. - auto numElts = origVectorTy->getNumElements(); + auto numElts = cast(origVectorTy)->getNumElements(); auto eltTy = origVectorTy->getElementType(); assert(numElts != 1); From c971b53b22a5cd43b54bf4773fe3c59ea1b805fb Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Wed, 26 Aug 2020 13:03:18 -0500 Subject: [PATCH 06/19] [Polly] Use llvm::function_ref. NFC. As suggested by David Blaike at https://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20200824/822584.html --- polly/include/polly/ScopInfo.h | 2 +- polly/lib/Analysis/ScopInfo.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h index b6fcddc6379ae7..5d18582a07edae 100644 --- a/polly/include/polly/ScopInfo.h +++ b/polly/include/polly/ScopInfo.h @@ -2319,7 +2319,7 @@ class Scop { /// ScopBuilder::buildAccessRelations. Therefore, if this /// method is called before buildAccessRelations, false /// must be passed. - void removeStmts(std::function ShouldDelete, + void removeStmts(function_ref ShouldDelete, bool AfterHoisting = true); /// Get an isl string representing the context. diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp index fdb06be0054659..a50e1a7f91af44 100644 --- a/polly/lib/Analysis/ScopInfo.cpp +++ b/polly/lib/Analysis/ScopInfo.cpp @@ -1752,7 +1752,7 @@ void Scop::removeFromStmtMap(ScopStmt &Stmt) { } } -void Scop::removeStmts(std::function ShouldDelete, +void Scop::removeStmts(function_ref ShouldDelete, bool AfterHoisting) { for (auto StmtIt = Stmts.begin(), StmtEnd = Stmts.end(); StmtIt != StmtEnd;) { if (!ShouldDelete(*StmtIt)) { From 6538fff37245921a0983d94c08af7e6cc120b3a9 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Wed, 26 Aug 2020 13:11:41 -0500 Subject: [PATCH 07/19] [Polly] Inline ShoulDelete lambda. NFC. 
As suggested by David Blaikie at ihttps://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20200824/822584.html --- polly/lib/Analysis/ScopInfo.cpp | 53 ++++++++++++++++---------------- polly/lib/Transform/Simplify.cpp | 5 ++- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp index a50e1a7f91af44..ba462351af57e5 100644 --- a/polly/lib/Analysis/ScopInfo.cpp +++ b/polly/lib/Analysis/ScopInfo.cpp @@ -1773,40 +1773,39 @@ void Scop::removeStmts(function_ref ShouldDelete, } void Scop::removeStmtNotInDomainMap() { - auto ShouldDelete = [this](ScopStmt &Stmt) -> bool { + removeStmts([this](ScopStmt &Stmt) -> bool { isl::set Domain = DomainMap.lookup(Stmt.getEntryBlock()); if (!Domain) return true; return Domain.is_empty(); - }; - removeStmts(ShouldDelete, false); + }); } void Scop::simplifySCoP(bool AfterHoisting) { - auto ShouldDelete = [AfterHoisting](ScopStmt &Stmt) -> bool { - // Never delete statements that contain calls to debug functions. - if (hasDebugCall(&Stmt)) - return false; - - bool RemoveStmt = Stmt.isEmpty(); - - // Remove read only statements only after invariant load hoisting. - if (!RemoveStmt && AfterHoisting) { - bool OnlyRead = true; - for (MemoryAccess *MA : Stmt) { - if (MA->isRead()) - continue; - - OnlyRead = false; - break; - } - - RemoveStmt = OnlyRead; - } - return RemoveStmt; - }; - - removeStmts(ShouldDelete, AfterHoisting); + removeStmts( + [AfterHoisting](ScopStmt &Stmt) -> bool { + // Never delete statements that contain calls to debug functions. + if (hasDebugCall(&Stmt)) + return false; + + bool RemoveStmt = Stmt.isEmpty(); + + // Remove read only statements only after invariant load hoisting. + if (!RemoveStmt && AfterHoisting) { + bool OnlyRead = true; + for (MemoryAccess *MA : Stmt) { + if (MA->isRead()) + continue; + + OnlyRead = false; + break; + } + + RemoveStmt = OnlyRead; + } + return RemoveStmt; + }, + AfterHoisting); } InvariantEquivClassTy *Scop::lookupInvariantEquivClass(Value *Val) { diff --git a/polly/lib/Transform/Simplify.cpp b/polly/lib/Transform/Simplify.cpp index f3b8bf83efe58a..d699aa4f499096 100644 --- a/polly/lib/Transform/Simplify.cpp +++ b/polly/lib/Transform/Simplify.cpp @@ -169,12 +169,11 @@ class Simplify : public ScopPass { void removeEmptyDomainStmts() { size_t NumStmtsBefore = S->getSize(); - auto ShouldDelete = [](ScopStmt &Stmt) -> bool { + S->removeStmts([](ScopStmt &Stmt) -> bool { auto EffectiveDomain = Stmt.getDomain().intersect_params(Stmt.getParent()->getContext()); return EffectiveDomain.is_empty(); - }; - S->removeStmts(ShouldDelete); + }); assert(NumStmtsBefore >= S->getSize()); EmptyDomainsRemoved = NumStmtsBefore - S->getSize(); From 476ca330894bf42feeb6c13547d14c821f6b8e0a Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Wed, 26 Aug 2020 11:17:26 -0700 Subject: [PATCH 08/19] [LTO] Don't apply LTOPostLink module flag during writeMergedModule For `ld64` which uses legacy LTOCodeGenerator, it relies on writeMergedModule to perform `ld -r` (generates a linked object file). If all the inputs to `ld -r` is fullLTO bitcode, `ld64` will linked the bitcode module, internalize all the symbols and write out another fullLTO bitcode object file. This bitcode file doesn't have all the bitcode inputs and it should not have LTOPostLink module flag. It will also cause error when this bitcode object file is linked with other LTO object file. Fix the issue by not applying LTOPostLink flag during writeMergedModule function. 
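For reference, the flag in question is an ordinary module flag. A merged module
that (incorrectly) carried it would show up in textual IR as

  !llvm.module.flags = !{!0}
  !0 = !{i32 1, !"LTOPostLink", i32 1}

which downstream passes read as an indication that module linking is complete.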
The flag should only be added when all the bitcode are linked and ready to be optimized. rdar://problem/58462798 Reviewed By: tejohnson Differential Revision: https://reviews.llvm.org/D84789 --- llvm/lib/LTO/LTOCodeGenerator.cpp | 5 +++-- llvm/test/LTO/ARM/lto-linking-metadata.ll | 6 +++++- llvm/tools/llvm-lto/llvm-lto.cpp | 13 +++++++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index 25ab1404b4e12a..aff1977850b888 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -466,8 +466,6 @@ void LTOCodeGenerator::applyScopeRestrictions() { internalizeModule(*MergedModule, mustPreserveGV); - MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1); - ScopeRestrictionsDone = true; } @@ -559,6 +557,9 @@ bool LTOCodeGenerator::optimize(bool DisableVerify, bool DisableInline, // Mark which symbols can not be internalized this->applyScopeRestrictions(); + // Write LTOPostLink flag for passes that require all the modules. + MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1); + // Instantiate the pass manager to organize the passes. legacy::PassManager passes; diff --git a/llvm/test/LTO/ARM/lto-linking-metadata.ll b/llvm/test/LTO/ARM/lto-linking-metadata.ll index ae6f42ff9be820..75b65ac85bed8f 100644 --- a/llvm/test/LTO/ARM/lto-linking-metadata.ll +++ b/llvm/test/LTO/ARM/lto-linking-metadata.ll @@ -1,7 +1,8 @@ ; RUN: opt %s -o %t1.bc -; RUN: llvm-lto %t1.bc -o %t1.save.opt -save-merged-module -O1 --exported-symbol=foo +; RUN: llvm-lto %t1.bc -o %t1.save.opt -save-linked-module -save-merged-module -O1 --exported-symbol=foo ; RUN: llvm-dis < %t1.save.opt.merged.bc | FileCheck %s +; RUN: llvm-dis < %t1.save.opt.linked.bc | FileCheck %s --check-prefix=CHECK-LINKED ; RUN: llvm-lto2 run %t1.bc -o %t.out.o -save-temps \ ; RUN: -r=%t1.bc,foo,pxl @@ -17,3 +18,6 @@ entry: ; CHECK: !llvm.module.flags = !{[[MD_NUM:![0-9]+]]} ; CHECK: [[MD_NUM]] = !{i32 1, !"LTOPostLink", i32 1} + +; CHECK-LINKED: @foo +; CHECK-LINKED-NOT: LTOPostLink diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp index 0bd9078f2d8cae..d56cd30cca1716 100644 --- a/llvm/tools/llvm-lto/llvm-lto.cpp +++ b/llvm/tools/llvm-lto/llvm-lto.cpp @@ -181,6 +181,10 @@ static cl::opt ThinLTOGeneratedObjectsDir( cl::desc("Save ThinLTO generated object files using filenames created in " "the given directory.")); +static cl::opt SaveLinkedModuleFile( + "save-linked-module", cl::init(false), + cl::desc("Write linked LTO module to file before optimize")); + static cl::opt SaveModuleFile("save-merged-module", cl::init(false), cl::desc("Write merged LTO module to file before CodeGen")); @@ -1029,6 +1033,15 @@ int main(int argc, char **argv) { CodeGen.setFileType(FT.getValue()); if (!OutputFilename.empty()) { + if (SaveLinkedModuleFile) { + std::string ModuleFilename = OutputFilename; + ModuleFilename += ".linked.bc"; + std::string ErrMsg; + + if (!CodeGen.writeMergedModules(ModuleFilename)) + error("writing linked module failed."); + } + if (!CodeGen.optimize(DisableVerify, DisableInline, DisableGVNLoadPRE, DisableLTOVectorization)) { // Diagnostic messages should have been printed by the handler. From 61dfa009579f10e75ef110a80c3e5b657ec5867a Mon Sep 17 00:00:00 2001 From: Francesco Petrogalli Date: Wed, 26 Aug 2020 15:43:56 +0000 Subject: [PATCH 09/19] [MC][SVE] Fix data operand for instruction alias of `st1d`. 
The version of `st1d` that operates with vector plus immediate addressing mode uses the alias `st1d { .d }, , [.d]` for rendering `st1d { .d }, , [.d, #0]`. The disassembler was generating `.s` instead of `.d>`. Differential Revision: https://reviews.llvm.org/D86633 --- llvm/lib/Target/AArch64/SVEInstrFormats.td | 2 +- llvm/test/MC/AArch64/SVE/st1b.s | 24 ++++++++++++++++++++++ llvm/test/MC/AArch64/SVE/st1d.s | 12 +++++++++++ llvm/test/MC/AArch64/SVE/st1h.s | 24 ++++++++++++++++++++++ llvm/test/MC/AArch64/SVE/st1w.s | 24 ++++++++++++++++++++++ 5 files changed, 85 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 4fd9bcd4494922..d118c11dfb6551 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -5487,7 +5487,7 @@ multiclass sve_mem_64b_sst_vi_ptrs opc, string asm, def : InstAlias(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>; def : InstAlias(NAME # _IMM) Z_s:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; + (!cast(NAME # _IMM) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt), (!cast(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; diff --git a/llvm/test/MC/AArch64/SVE/st1b.s b/llvm/test/MC/AArch64/SVE/st1b.s index a6f766bdfd7cc7..40b830709ead48 100644 --- a/llvm/test/MC/AArch64/SVE/st1b.s +++ b/llvm/test/MC/AArch64/SVE/st1b.s @@ -168,3 +168,27 @@ st1b { z31.d }, p7, [z31.d, #31] // CHECK-ENCODING: [0xff,0xbf,0x5f,0xe4] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf 5f e4 + +st1b { z0.s }, p7, [z0.s, #0] +// CHECK-INST: st1b { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e4 + +st1b { z0.s }, p7, [z0.s] +// CHECK-INST: st1b { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e4 + +st1b { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1b { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e4 + +st1b { z0.d }, p7, [z0.d] +// CHECK-INST: st1b { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e4 diff --git a/llvm/test/MC/AArch64/SVE/st1d.s b/llvm/test/MC/AArch64/SVE/st1d.s index ba4a0e5be114b2..a5a19e772b5288 100644 --- a/llvm/test/MC/AArch64/SVE/st1d.s +++ b/llvm/test/MC/AArch64/SVE/st1d.s @@ -78,3 +78,15 @@ st1d { z31.d }, p7, [z31.d, #248] // CHECK-ENCODING: [0xff,0xbf,0xdf,0xe5] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf df e5 + +st1d { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1d { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc c0 e5 + +st1d { z0.d }, p7, [z0.d] +// CHECK-INST: st1d { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc c0 e5 diff --git a/llvm/test/MC/AArch64/SVE/st1h.s b/llvm/test/MC/AArch64/SVE/st1h.s index cd6c20d83482e5..fe22c52bb9befe 100644 --- a/llvm/test/MC/AArch64/SVE/st1h.s +++ b/llvm/test/MC/AArch64/SVE/st1h.s @@ -168,3 +168,27 @@ st1h { z31.d }, p7, [z31.d, #62] // CHECK-ENCODING: [0xff,0xbf,0xdf,0xe4] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf df e4 + +st1h { z0.s }, p7, 
[z0.s, #0] +// CHECK-INST: st1h { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0xe0,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc e0 e4 + +st1h { z0.s }, p7, [z0.s] +// CHECK-INST: st1h { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0xe0,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc e0 e4 + +st1h { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1h { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc c0 e4 + +st1h { z0.d }, p7, [z0.d] +// CHECK-INST: st1h { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc c0 e4 diff --git a/llvm/test/MC/AArch64/SVE/st1w.s b/llvm/test/MC/AArch64/SVE/st1w.s index e20194f5747e9e..5bbcd2e1ea0ff4 100644 --- a/llvm/test/MC/AArch64/SVE/st1w.s +++ b/llvm/test/MC/AArch64/SVE/st1w.s @@ -138,3 +138,27 @@ st1w { z31.d }, p7, [z31.d, #124] // CHECK-ENCODING: [0xff,0xbf,0x5f,0xe5] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf 5f e5 + +st1w { z0.s }, p7, [z0.s, #0] +// CHECK-INST: st1w { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e5 + +st1w { z0.s }, p7, [z0.s] +// CHECK-INST: st1w { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e5 + +st1w { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1w { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e5 + +st1w { z0.d }, p7, [z0.d] +// CHECK-INST: st1w { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e5 From 1446c1801deaf7a38221b45662f2e17fa1d5e8f0 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 26 Aug 2020 11:30:10 -0700 Subject: [PATCH 10/19] [gn build] Manually port ed07e1fe --- llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index 35c6890efd6da9..d54242da38ccaa 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -213,6 +213,7 @@ write_cmake_config("config") { "HAVE_SIGALTSTACK=", "HAVE_STRERROR_R=", "HAVE_SYSCONF=", + "HAVE_SYSEXITS_H=", "HAVE_SYS_IOCTL_H=", "HAVE_SYS_MMAN_H=", "HAVE_SYS_PARAM_H=", @@ -249,6 +250,7 @@ write_cmake_config("config") { "HAVE_SIGALTSTACK=1", "HAVE_STRERROR_R=1", "HAVE_SYSCONF=1", + "HAVE_SYSEXITS_H=1", "HAVE_SYS_IOCTL_H=1", "HAVE_SYS_MMAN_H=1", "HAVE_SYS_PARAM_H=1", From 098d3f98276de90b6e1468031bd3858615240bb7 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Mon, 10 Aug 2020 12:53:30 -0700 Subject: [PATCH 11/19] [InstSimplify] Simplify to vector constants when possible InstSimplify should do all transformations that ConstProp does, but one thing that ConstProp does that InstSimplify wouldn't is inline vector instructions that are constants, e.g. into a ret. Previously vector instructions wouldn't be inlined in InstSimplify because llvm::Simplify*Instruction() would return nullptr for specific instructions, such as vector instructions that were actually constants, if it couldn't simplify them. 
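For example, in a reduced scalable-vector case along the lines of the tests added
below (the function name is illustrative only), a constant splat built out of
instructions

  define <vscale x 4 x i32> @splat_one() {
    %i = insertelement <vscale x 4 x i32> undef, i32 1, i32 0
    %s = shufflevector <vscale x 4 x i32> %i, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
    ret <vscale x 4 x i32> %s
  }

was previously left as-is; after this change the body folds to a single ret of the
equivalent insertelement/shufflevector constant expression.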
This changes SimplifyInsertElementInst, SimplifyExtractElementInst, and SimplifyShuffleVectorInst to return a vector constant when possible. Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D85946 --- llvm/lib/Analysis/InstructionSimplify.cpp | 8 +++--- .../ConstantFolding/vscale-shufflevector.ll | 4 +-- llvm/test/Transforms/InstSimplify/vscale.ll | 27 +++++++++++++++++++ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 9b5bb37a0ff750..887079445eccfa 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4302,7 +4302,7 @@ Value *llvm::SimplifyInsertElementInst(Value *Vec, Value *Val, Value *Idx, auto *ValC = dyn_cast(Val); auto *IdxC = dyn_cast(Idx); if (VecC && ValC && IdxC) - return ConstantFoldInsertElementInstruction(VecC, ValC, IdxC); + return ConstantExpr::getInsertElement(VecC, ValC, IdxC); // For fixed-length vector, fold into undef if index is out of bounds. if (auto *CI = dyn_cast(Idx)) { @@ -4367,7 +4367,7 @@ static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, auto *VecVTy = cast(Vec->getType()); if (auto *CVec = dyn_cast(Vec)) { if (auto *CIdx = dyn_cast(Idx)) - return ConstantFoldExtractElementInstruction(CVec, CIdx); + return ConstantExpr::getExtractElement(CVec, CIdx); // The index is not relevant if our vector is a splat. if (auto *Splat = CVec->getSplatValue()) @@ -4565,8 +4565,8 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, // If all operands are constant, constant fold the shuffle. This // transformation depends on the value of the mask which is not known at // compile time for scalable vectors - if (!Scalable && Op0Const && Op1Const) - return ConstantFoldShuffleVectorInstruction(Op0Const, Op1Const, Mask); + if (Op0Const && Op1Const) + return ConstantExpr::getShuffleVector(Op0Const, Op1Const, Mask); // Canonicalization: if only one input vector is constant, it shall be the // second one. This transformation depends on the value of the mask which diff --git a/llvm/test/Analysis/ConstantFolding/vscale-shufflevector.ll b/llvm/test/Analysis/ConstantFolding/vscale-shufflevector.ll index dc3b66e18f8717..9c1f6730122e23 100644 --- a/llvm/test/Analysis/ConstantFolding/vscale-shufflevector.ll +++ b/llvm/test/Analysis/ConstantFolding/vscale-shufflevector.ll @@ -15,9 +15,7 @@ target triple = "aarch64" ; the compiler. It happens to be the case that this will be the result. 
; CHECK-LABEL: define @vscale_version() -; CHECK-NEXT: %splatter = insertelement undef, i1 true, i32 0 -; CHECK-NEXT: %foo = shufflevector %splatter, undef, zeroinitializer -; CHECK-NEXT: ret %foo +; CHECK-NEXT: ret shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer) define @vscale_version() { %splatter = insertelement undef, i1 true, i32 0 diff --git a/llvm/test/Transforms/InstSimplify/vscale.ll b/llvm/test/Transforms/InstSimplify/vscale.ll index dd4ca47a52eca4..802cb99001672b 100644 --- a/llvm/test/Transforms/InstSimplify/vscale.ll +++ b/llvm/test/Transforms/InstSimplify/vscale.ll @@ -51,6 +51,23 @@ define @insert_extract_element_same_vec_idx_1( %r } +define @insertelement_inline_to_ret() { +; CHECK-LABEL: @insertelement_inline_to_ret( +; CHECK-NEXT: ret insertelement ( undef, i32 1, i32 0) +; + %i = insertelement undef, i32 1, i32 0 + ret %i +} + +define @insertelement_shufflevector_inline_to_ret() { +; CHECK-LABEL: @insertelement_shufflevector_inline_to_ret( +; CHECK-NEXT: ret shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) +; + %i = insertelement undef, i32 1, i32 0 + %i2 = shufflevector %i, undef, zeroinitializer + ret %i2 +} + ; extractelement define i32 @extractelement_idx_undef( %a) { @@ -120,6 +137,16 @@ define @cmp_le_smax_always_true( %x) { ret %cmp } +define @bitcast() { +; CHECK-LABEL: @bitcast( +; CHECK-NEXT: ret bitcast ( shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) to ) +; + %i1 = insertelement undef, i32 1, i32 0 + %i2 = shufflevector %i1, undef, zeroinitializer + %i3 = bitcast %i2 to + ret %i3 +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Memory Access and Addressing Operations ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; From ea7b1c79f73d8def5d806ae79dea125d146ac864 Mon Sep 17 00:00:00 2001 From: Eric Christopher Date: Wed, 26 Aug 2020 11:44:57 -0700 Subject: [PATCH 12/19] Add cmake test support for LLJITWithThinLTOSummaries to make sure it's being built and called (and substituted). 
--- llvm/test/CMakeLists.txt | 1 + llvm/test/lit.cfg.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index cde80035a09bff..58aa680a54c22a 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -153,6 +153,7 @@ if(LLVM_BUILD_EXAMPLES) Kaleidoscope-Ch5 Kaleidoscope-Ch6 Kaleidoscope-Ch7 + LLJITWithThinLTOSummaries ) if (NOT WIN32) list(APPEND LLVM_TEST_DEPENDS diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 4502ac58c45ac7..6be94245380ad5 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -168,7 +168,8 @@ def get_asan_rtlib(): ToolSubst('Kaleidoscope-Ch5', unresolved='ignore'), ToolSubst('Kaleidoscope-Ch6', unresolved='ignore'), ToolSubst('Kaleidoscope-Ch7', unresolved='ignore'), - ToolSubst('Kaleidoscope-Ch8', unresolved='ignore')]) + ToolSubst('Kaleidoscope-Ch8', unresolved='ignore'), + ToolSubst('LLJITWithThinLTOSummaries', unresolved='ignore')]) llvm_config.add_tool_substitutions(tools, config.llvm_tools_dir) From 603a8a60ba444eb7fc77f0b31dd063a7583df2c4 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 27 Aug 2020 03:50:14 +0900 Subject: [PATCH 13/19] [mlir] NFC: fix trivial typos in documents Reviewed By: mravishankar Differential Revision: https://reviews.llvm.org/D86563 --- mlir/docs/CAPI.md | 20 ++++++++++---------- mlir/docs/Dialects/Linalg.md | 2 +- mlir/docs/OpDefinitions.md | 2 +- mlir/docs/Rationale/Rationale.md | 2 +- mlir/docs/SPIRVToLLVMDialectConversion.md | 10 +++++----- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/mlir/docs/CAPI.md b/mlir/docs/CAPI.md index 68a28950ebc32e..2ec25d15747c7e 100644 --- a/mlir/docs/CAPI.md +++ b/mlir/docs/CAPI.md @@ -45,8 +45,8 @@ for creation functions). For example, `mlirOperationGetNumOperands` inspects an The *ownership* model is encoded in the naming convention as follows. -- By default, the ownership is not transerred. -- Functions that tranfer the ownership of the result to the caller can be in +- By default, the ownership is not transferred. +- Functions that transfer the ownership of the result to the caller can be in one of two forms: * functions that create a new object have the name `mlirXCreate<...>`, for example, `mlirOperationCreate`; @@ -127,7 +127,7 @@ stored and perform the copy. There is no guarantee that the pointer supplied to the callback points to a null-terminated string, the size argument should be used to find the end of the string. The callback may be called multiple times with consecutive chunks of the string representation (the printing itself is -bufferred). +buffered). *Rationale*: this approach allows the caller to have full control of the allocation and avoid unnecessary allocation and copying inside the printer. @@ -143,9 +143,9 @@ The API adopts the following patterns for recurrent functionality in MLIR. An object has an _indexed component_ if it has fields accessible using a zero-based contiguous integer index, typically arrays. For example, an -`MlirBlock` has its arguments as a indexed component. An object may have several -such components. For example, an `MlirOperation` has attributes, operands, -regions, results and successors. +`MlirBlock` has its arguments as an indexed component. An object may have +several such components. For example, an `MlirOperation` has attributes, +operands, regions, results and successors. For indexed components, the following pair of functions is provided. 
@@ -153,10 +153,10 @@ For indexed components, the following pair of functions is provided. - `MlirY mlirXGet(MlirX, intptr_t pos)` returns 'pos'-th subobject. The sizes are accepted and returned as signed pointer-sized integers, i.e. -`intptr_t`. This typedef is avalable in C99. +`intptr_t`. This typedef is available in C99. Note that the name of subobject in the function does not necessarily match the -type of the subobject. For example, `mlirOperationGetOperand` returns a +type of the subobject. For example, `mlirOperationGetOperand` returns an `MlirValue`. ### Iterable Components @@ -190,12 +190,12 @@ for (iter = mlirXGetFirst(x); !mlirYIsNull(iter); ### Extensions for Dialect Attributes and Types -Dialect attributes and types can follow the example of standard attrbutes and +Dialect attributes and types can follow the example of standard attributes and types, provided that implementations live in separate directories, i.e. `include/mlir-c/<...>Dialect/` and `lib/CAPI/<...>Dialect/`. The core APIs provide implementation-private headers in `include/mlir/CAPI/IR` that allow one to convert between opaque C structures for core IR components and their C++ counterparts. `wrap` converts a C++ class into a C structure and `unwrap` does -the inverse conversion. Once the a C++ object is available, the API +the inverse conversion. Once the C++ object is available, the API implementation should rely on `isa` to implement `mlirXIsAY` and is expected to use `cast` inside other API calls. diff --git a/mlir/docs/Dialects/Linalg.md b/mlir/docs/Dialects/Linalg.md index edf5eb217799ae..7ae1b73f48a735 100644 --- a/mlir/docs/Dialects/Linalg.md +++ b/mlir/docs/Dialects/Linalg.md @@ -268,7 +268,7 @@ to correspond to the operations inside the region: the region can capture buffers arbitrarily and write into them. If this conflicts with some parallel iterator requirement, this is undefined behavior. -Previous examples already ellaborate compute payloads with an unregistered function `"some_compute"`. The following code snippet shows what the result will be when using a concrete operation `addf`: +Previous examples already elaborate compute payloads with an unregistered function `"some_compute"`. The following code snippet shows what the result will be when using a concrete operation `addf`: ``` // File name: example3.mlir #indexing_maps = [ diff --git a/mlir/docs/OpDefinitions.md b/mlir/docs/OpDefinitions.md index 0997f297274335..418da6a857dcf0 100644 --- a/mlir/docs/OpDefinitions.md +++ b/mlir/docs/OpDefinitions.md @@ -1078,7 +1078,7 @@ to convert between the internal storage and the helper method. ### Attribute decorators -There are a few important attribute adapters/decorators/modifers that can be +There are a few important attribute adapters/decorators/modifiers that can be applied to ODS attributes to specify common additional properties like optionality, default values, etc.: diff --git a/mlir/docs/Rationale/Rationale.md b/mlir/docs/Rationale/Rationale.md index e67bf9de8684b4..c7953e6859586a 100644 --- a/mlir/docs/Rationale/Rationale.md +++ b/mlir/docs/Rationale/Rationale.md @@ -214,7 +214,7 @@ operations. We allow `index` types in tensors and memrefs as a code generation strategy has to map `index` to an implementation type and hence needs to be able to materialize corresponding values. However, the target might lack support for -`vector` values with the target specfic equivalent of the `index` type. +`vector` values with the target specific equivalent of the `index` type. 
### Bit width of a non-primitive type and `index` is undefined diff --git a/mlir/docs/SPIRVToLLVMDialectConversion.md b/mlir/docs/SPIRVToLLVMDialectConversion.md index 799107e033a495..c5a1579305272a 100644 --- a/mlir/docs/SPIRVToLLVMDialectConversion.md +++ b/mlir/docs/SPIRVToLLVMDialectConversion.md @@ -84,7 +84,7 @@ at the moment. Hence, we adhere to the following mapping: size of the previous struct elements) are **not** supported. In this case, offsets can be emulated with padding fields (*e.g.* integers). However, such a design would require index recalculation in the conversion of ops that - involve memmory addressing. + involve memory addressing. Examples of SPIR-V struct conversion are: ```mlir @@ -204,7 +204,7 @@ to note: be: ```mlir - // Zero extending offest after broadcasting + // Zero extending offset after broadcasting %res_offset = llvm.zext %vec_offset: !llvm.vec<2 x i8> to !llvm.vec<2 x i32> ``` @@ -515,7 +515,7 @@ Also, at the moment initialization is only possible via `spv.constant`. llvm.store %c, %res : !llvm.ptr ``` -Note that simple conversion to `alloca` may not be sufficent if the code has +Note that simple conversion to `alloca` may not be sufficient if the code has some scoping. For example, if converting ops executed in a loop into `alloca`s, a stack overflow may occur. For this case, `stacksave`/`stackrestore` pair can be used (TODO). @@ -618,7 +618,7 @@ As well as: `spv.Branch` and `spv.BranchConditional` are mapped to `llvm.br` and `llvm.cond_br`. Branch weigths for `spv.BranchConditional` are mapped to -coresponding `branch_weights` attribute of `llvm.cond_br`. When translated to +corresponding `branch_weights` attribute of `llvm.cond_br`. When translated to proper LLVM, `branch_weights` are converted into LLVM metadata associated with the conditional branch. @@ -744,7 +744,7 @@ to LLVM dialect. ### `spv.func` This op declares or defines a SPIR-V function and it is converted to `llvm.func`. -This conversion handles signarture conversion, and function control attributes +This conversion handles signature conversion, and function control attributes remapping to LLVM dialect function [`passthrough` attribute](Dialects/LLVM.md#Attribute-pass-through). 
The following mapping is used to map [SPIR-V function control](SPIRVFunctionAttributes) to From 1596ea80fdf3410f94ef9a2548701d26cc81c2f5 Mon Sep 17 00:00:00 2001 From: AndreyChurbanov Date: Wed, 26 Aug 2020 21:56:01 +0300 Subject: [PATCH 14/19] [OpenMP] Fix import library installation with MinGW Patch by mati865@gmail.com Differential Revision: https://reviews.llvm.org/D86552 --- openmp/runtime/src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt index 81275c0483dd4a..19423f58c6c4a2 100644 --- a/openmp/runtime/src/CMakeLists.txt +++ b/openmp/runtime/src/CMakeLists.txt @@ -195,7 +195,7 @@ if(WIN32) # the import library is "re-linked" to include kmp_import.cpp which prevents # linking of both Visual Studio OpenMP and newly built OpenMP set_source_files_properties(kmp_import.cpp PROPERTIES COMPILE_FLAGS "${LIBOMP_CONFIGURED_CXXFLAGS}") - set(LIBOMP_IMP_LIB_FILE ${LIBOMP_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(LIBOMP_IMP_LIB_FILE ${LIBOMP_LIB_NAME}${CMAKE_IMPORT_LIBRARY_SUFFIX}) set(LIBOMP_GENERATED_IMP_LIB_FILENAME ${LIBOMP_LIB_FILE}${CMAKE_STATIC_LIBRARY_SUFFIX}) set_target_properties(omp PROPERTIES VERSION ${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR} # uses /version flag From 28fbf422f248fc74681a53208aa2f543a67515ac Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Wed, 26 Aug 2020 20:01:42 +0100 Subject: [PATCH 15/19] [libomptarget][amdgpu] Update plugin CMake to work with latest rocr library --- .../plugins/amdgpu/CMakeLists.txt | 33 ++++++------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt index 6498565babd86a..7483e4e5c0eae1 100644 --- a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt +++ b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt @@ -13,22 +13,15 @@ ################################################################################ -if(NOT LIBOMPTARGET_DEP_LIBELF_FOUND) - libomptarget_say("Not building AMDGPU plugin: LIBELF not found") +# as of rocm-3.7, hsa is installed with cmake packages and kmt is found via hsa +find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm) +if (NOT ${hsa-runtime64_FOUND}) + libomptarget_say("Not building HSA plugin: hsa-runtime64 not found") return() endif() -# rocr cmake uses DHSAKMT_INC_PATH, DHSAKMT_LIB_PATH to find roct -# following that, look for DHSA_INC_PATH, DHSA_LIB_PATH, which allows -# builds to use source and library files from various locations - -if(ROCM_DIR) - set(HSA_INC_PATH ${ROCM_DIR}/hsa/include ${ROCM_DIR}/hsa/include/hsa) - set(HSA_LIB_PATH ${ROCM_DIR}/hsa/lib) - set(HSAKMT_INC_PATH "") - set(HSAKMT_LIB_PATH ${ROCM_DIR}/lib) -elseif(NOT (HSA_INC_PATH AND HSA_LIB_PATH AND HSAKMT_INC_PATH AND HSAKMT_LIB_PATH)) - libomptarget_say("Not building AMDGPU plugin: ROCM library paths unspecified") +if(NOT LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building AMDGPU plugin: LIBELF not found") return() endif() @@ -38,11 +31,6 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_ endif() libomptarget_say("Building amdgpu offloading plugin") -libomptarget_say("HSA plugin: HSA_INC_PATH: ${HSA_INC_PATH}") -libomptarget_say("HSA plugin: HSA_LIB_PATH: ${HSA_LIB_PATH}") -libomptarget_say("HSA plugin: HSAKMT_INC_PATH: ${HSAKMT_INC_PATH}") -libomptarget_say("HSA plugin: HSAKMT_LIB_PATH: ${HSAKMT_LIB_PATH}") - 
################################################################################ # Define the suffix for the runtime messaging dumps. add_definitions(-DTARGET_NAME=AMDGPU) @@ -55,7 +43,6 @@ if(CMAKE_BUILD_TYPE MATCHES Debug) endif() include_directories( - ${HSA_INC_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/impl ) @@ -74,12 +61,12 @@ add_library(omptarget.rtl.amdgpu SHARED # When we build for debug, OPENMP_LIBDIR_SUFFIX get set to -debug install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "lib${OPENMP_LIBDIR_SUFFIX}") -add_dependencies(omptarget.rtl.amdgpu hsa-runtime64 hsakmt) +set_property(TARGET omptarget.rtl.amdgpu PROPERTY INSTALL_RPATH "$ORIGIN") target_link_libraries( omptarget.rtl.amdgpu - -lpthread -ldl -Wl,-rpath,${OPENMP_INSTALL_LIBDIR} - -L${HSA_LIB_PATH} -L${HSAKMT_LIB_PATH} -lhsa-runtime64 -lhsakmt -Wl,-rpath,${HSA_LIB_PATH},-rpath,${HSAKMT_LIB_PATH} - -lelf + PRIVATE + hsa-runtime64::hsa-runtime64 + pthread dl elf "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" "-Wl,-z,defs" ) From ceffd6993c350b57f43cec3b6371b159fc4a3149 Mon Sep 17 00:00:00 2001 From: Aleksandr Platonov Date: Wed, 26 Aug 2020 22:10:35 +0300 Subject: [PATCH 16/19] [Support][Windows] Fix incorrect GetFinalPathNameByHandleW() return value check in realPathFromHandle() `GetFinalPathNameByHandleW(,,N,)` returns: - `< N` on success (this value does not include the size of the terminating null character) - `>= N` if buffer is too small (this value includes the size of the terminating null character) So, when `N == Buffer.capacity() - 1`, we need to resize buffer if return value is > `Buffer.capacity() - 2`. Also, we can set `N` to `Buffer.capacity()`. Thus, without this patch `realPathFromHandle()` returns unfilled buffer when length of the final path of the file is equal to `Buffer.capacity()` or `Buffer.capacity() - 1`. Reviewed By: andrewng, amccarth Differential Revision: https://reviews.llvm.org/D86564 --- llvm/lib/Support/Windows/Path.inc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index 0115161636c4b3..399b054d36bd20 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -19,7 +19,6 @@ #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/WindowsError.h" #include -#include #include #include @@ -352,13 +351,13 @@ std::error_code is_local(const Twine &path, bool &result) { static std::error_code realPathFromHandle(HANDLE H, SmallVectorImpl &Buffer) { DWORD CountChars = ::GetFinalPathNameByHandleW( - H, Buffer.begin(), Buffer.capacity() - 1, FILE_NAME_NORMALIZED); - if (CountChars > Buffer.capacity()) { + H, Buffer.begin(), Buffer.capacity(), FILE_NAME_NORMALIZED); + if (CountChars && CountChars >= Buffer.capacity()) { // The buffer wasn't big enough, try again. In this case the return value // *does* indicate the size of the null terminator. Buffer.reserve(CountChars); CountChars = ::GetFinalPathNameByHandleW( - H, Buffer.data(), Buffer.capacity() - 1, FILE_NAME_NORMALIZED); + H, Buffer.begin(), Buffer.capacity(), FILE_NAME_NORMALIZED); } if (CountChars == 0) return mapWindowsError(GetLastError()); From c6c292da910578bdec76616c606da2d79b730667 Mon Sep 17 00:00:00 2001 From: aartbik Date: Wed, 26 Aug 2020 11:00:55 -0700 Subject: [PATCH 17/19] [llvm] [Thumb2] Test unusual length for active lane mask Thumb2 test for the fixed issue with unusual length. 
https://bugs.llvm.org/show_bug.cgi?id=47299 Reviewed By: SjoerdMeijer Differential Revision: https://reviews.llvm.org/D86646 --- llvm/test/CodeGen/Thumb2/active_lane_mask.ll | 102 ++++++++++++++++--- 1 file changed, 90 insertions(+), 12 deletions(-) diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll index 7696e6645195f8..116031cb895ffd 100644 --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -33,11 +33,88 @@ define <4 x i32> @v4i32(i32 %index, i32 %BTC, <4 x i32> %V1, <4 x i32> %V2) { ret <4 x i32> %select } +define <7 x i32> @v7i32(i32 %index, i32 %BTC, <7 x i32> %V1, <7 x i32> %V2) { +; CHECK-LABEL: v7i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: adr r3, .LCPI1_0 +; CHECK-NEXT: vdup.32 q1, r1 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vadd.i32 q2, q0, r1 +; CHECK-NEXT: vdup.32 q0, r2 +; CHECK-NEXT: vcmp.u32 hi, q1, q2 +; CHECK-NEXT: ldr r2, [sp, #32] +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.u32 hi, q0, q2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: ldr r2, [sp, #36] +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: ldr r2, [sp, #40] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: ldr r2, [sp, #44] +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: ldr r2, [sp] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: ldr r2, [sp, #4] +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: ldr r2, [sp, #8] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: ldr r2, [sp, #12] +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: adr r2, .LCPI1_1 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r2] +; CHECK-NEXT: movw r2, #4095 +; CHECK-NEXT: vadd.i32 q2, q2, r1 +; CHECK-NEXT: vcmp.u32 hi, q1, q2 +; CHECK-NEXT: vmrs r1, p0 +; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: ldr r1, [sp, #48] +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.u32 hi, q0, q2 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: ldr r1, [sp, #52] +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: ldr r1, [sp, #56] +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: ldr r1, [sp, #16] +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: ldr r1, [sp, #20] +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: ldr r1, [sp, #24] +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: strd r3, r2, [r0, #16] +; CHECK-NEXT: str r1, [r0, #24] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .LCPI1_1: +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .zero 4 + %active.lane.mask = call <7 x i1> @llvm.get.active.lane.mask.v7i1.i32(i32 %index, i32 %BTC) + %select = select <7 x i1> %active.lane.mask, <7 x i32> %V1, <7 x i32> %V2 + ret <7 x i32> %select +} + define <8 x i16> @v8i16(i32 %index, i32 %BTC, <8 x i16> %V1, <8 x i16> %V2) { ; CHECK-LABEL: v8i16: ; CHECK: @ %bb.0: ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: adr.w r12, .LCPI1_0 +; CHECK-NEXT: adr.w r12, .LCPI2_0 ; CHECK-NEXT: vdup.32 q5, r1 ; CHECK-NEXT: vldrw.u32 q0, [r12] ; CHECK-NEXT: vmov.i8 q1, #0x0 @@ -53,7 +130,7 @@ define <8 x i16> @v8i16(i32 %index, i32 %BTC, <8 x i16> %V1, <8 x i16> %V2) { ; CHECK-NEXT: vmov.16 q0[2], r1 ; CHECK-NEXT: vmov r1, s19 ; 
CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: adr r1, .LCPI1_1 +; CHECK-NEXT: adr r1, .LCPI2_1 ; CHECK-NEXT: vldrw.u32 q4, [r1] ; CHECK-NEXT: vadd.i32 q4, q4, r0 ; CHECK-NEXT: vcmp.u32 hi, q5, q4 @@ -102,12 +179,12 @@ define <8 x i16> @v8i16(i32 %index, i32 %BTC, <8 x i16> %V1, <8 x i16> %V2) { ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .LCPI2_0: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 2 @ 0x2 ; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .LCPI1_1: +; CHECK-NEXT: .LCPI2_1: ; CHECK-NEXT: .long 4 @ 0x4 ; CHECK-NEXT: .long 5 @ 0x5 ; CHECK-NEXT: .long 6 @ 0x6 @@ -122,7 +199,7 @@ define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK: @ %bb.0: ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: adr.w r12, .LCPI2_0 +; CHECK-NEXT: adr.w r12, .LCPI3_0 ; CHECK-NEXT: vdup.32 q7, r1 ; CHECK-NEXT: vldrw.u32 q0, [r12] ; CHECK-NEXT: vmov.i8 q5, #0x0 @@ -138,7 +215,7 @@ define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-NEXT: vmov.16 q2[2], r1 ; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: adr r1, .LCPI2_1 +; CHECK-NEXT: adr r1, .LCPI3_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vadd.i32 q3, q0, r0 ; CHECK-NEXT: vcmp.u32 hi, q7, q3 @@ -169,7 +246,7 @@ define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-NEXT: vmov.8 q2[6], r1 ; CHECK-NEXT: vmov.u16 r1, q0[7] ; CHECK-NEXT: vmov.8 q2[7], r1 -; CHECK-NEXT: adr r1, .LCPI2_2 +; CHECK-NEXT: adr r1, .LCPI3_2 ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vcmp.u32 hi, q7, q0 @@ -183,7 +260,7 @@ define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-NEXT: vmov.16 q0[2], r1 ; CHECK-NEXT: vmov r1, s27 ; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: adr r1, .LCPI2_3 +; CHECK-NEXT: adr r1, .LCPI3_3 ; CHECK-NEXT: vldrw.u32 q6, [r1] ; CHECK-NEXT: vadd.i32 q6, q6, r0 ; CHECK-NEXT: vcmp.u32 hi, q7, q6 @@ -308,22 +385,22 @@ define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI2_0: +; CHECK-NEXT: .LCPI3_0: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 2 @ 0x2 ; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .LCPI2_1: +; CHECK-NEXT: .LCPI3_1: ; CHECK-NEXT: .long 4 @ 0x4 ; CHECK-NEXT: .long 5 @ 0x5 ; CHECK-NEXT: .long 6 @ 0x6 ; CHECK-NEXT: .long 7 @ 0x7 -; CHECK-NEXT: .LCPI2_2: +; CHECK-NEXT: .LCPI3_2: ; CHECK-NEXT: .long 8 @ 0x8 ; CHECK-NEXT: .long 9 @ 0x9 ; CHECK-NEXT: .long 10 @ 0xa ; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .LCPI2_3: +; CHECK-NEXT: .LCPI3_3: ; CHECK-NEXT: .long 12 @ 0xc ; CHECK-NEXT: .long 13 @ 0xd ; CHECK-NEXT: .long 14 @ 0xe @@ -334,5 +411,6 @@ define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { } declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <7 x i1> @llvm.get.active.lane.mask.v7i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) From 54a5dd485c4d04d142a58c9349ada0c897cbeae6 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Wed, 26 Aug 2020 15:21:54 -0400 Subject: [PATCH 18/19] [DAGCombiner] allow store merging non-i8 truncated ops We have a gap in our store merging capabilities for shift+truncate patterns as discussed in: 
https://llvm.org/PR46662 I generalized the code/comments for this function in earlier commits, so we only need ease the type restriction and adjust the address/endian checking to make this work. AArch64 lets us switch endian to make sure that patterns are matched either way. Differential Revision: https://reviews.llvm.org/D86420 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 37 ++- .../test/CodeGen/AArch64/merge-trunc-store.ll | 240 ++++++++++++------ llvm/test/CodeGen/X86/stores-merging.ll | 18 +- 3 files changed, 187 insertions(+), 108 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 40502f0c6993d8..445e2bff6c058a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6869,8 +6869,9 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { SmallVector Stores; for (StoreSDNode *Store = N; Store; Store = dyn_cast(Chain)) { // TODO: Allow unordered atomics when wider type is legal (see D66309) - if (Store->getMemoryVT() != MVT::i8 || !Store->isSimple() || - Store->isIndexed()) + EVT MemVT = Store->getMemoryVT(); + if (!(MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) || + !Store->isSimple() || Store->isIndexed()) return SDValue(); Stores.push_back(Store); Chain = Store->getChain(); @@ -6959,12 +6960,6 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { assert(FirstOffset != INT64_MAX && "First byte offset must be set"); assert(FirstStore && "First store must be set"); - // Check if the bytes of the combined value we are looking at match with - // either big or little endian value store. - Optional IsBigEndian = isBigEndian(OffsetMap, FirstOffset); - if (!IsBigEndian.hasValue()) - return SDValue(); - // Check that a store of the wide type is both allowed and fast on the target const DataLayout &Layout = DAG.getDataLayout(); bool Fast = false; @@ -6973,6 +6968,31 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { if (!Allowed || !Fast) return SDValue(); + // Check if the pieces of the value are going to the expected places in memory + // to merge the stores. + auto checkOffsets = [&](bool MatchLittleEndian) { + if (MatchLittleEndian) { + for (unsigned i = 0; i != NumStores; ++i) + if (OffsetMap[i] != i * (NarrowNumBits / 8) + FirstOffset) + return false; + } else { // MatchBigEndian by reversing loop counter. + for (unsigned i = 0, j = NumStores - 1; i != NumStores; ++i, --j) + if (OffsetMap[j] != i * (NarrowNumBits / 8) + FirstOffset) + return false; + } + return true; + }; + + // Check if the offsets line up for the native data layout of this target. + bool NeedBswap = false; + if (!checkOffsets(Layout.isLittleEndian())) { + // Special-case: check if byte offsets line up for the opposite endian. + // TODO: We could use rotates for 16/32-bit merge pairs. + if (NarrowNumBits != 8 || !checkOffsets(Layout.isBigEndian())) + return SDValue(); + NeedBswap = true; + } + SDLoc DL(N); if (WideVT != SourceValue.getValueType()) { assert(SourceValue.getValueType().getSizeInBits() > WideNumBits && @@ -6983,7 +7003,6 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { // Before legalize we can introduce illegal bswaps which will be later // converted to an explicit bswap sequence. This way we end up with a single // store and byte shuffling instead of several stores and byte shuffling. 
- bool NeedBswap = Layout.isBigEndian() != *IsBigEndian; if (NeedBswap) SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue); diff --git a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll index 9d9ea3ec8951ee..3f8fa3e9e38379 100644 --- a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll +++ b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll @@ -200,12 +200,17 @@ define void @be_i32_to_i8_order(i32 %x, i8* %p0) { } define void @le_i32_to_i16(i32 %x, i16* %p0) { -; CHECK-LABEL: le_i32_to_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #16 -; CHECK-NEXT: strh w0, [x1] -; CHECK-NEXT: strh w8, [x1, #2] -; CHECK-NEXT: ret +; LE-LABEL: le_i32_to_i16: +; LE: // %bb.0: +; LE-NEXT: str w0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: le_i32_to_i16: +; BE: // %bb.0: +; BE-NEXT: lsr w8, w0, #16 +; BE-NEXT: strh w0, [x1] +; BE-NEXT: strh w8, [x1, #2] +; BE-NEXT: ret %sh1 = lshr i32 %x, 16 %t0 = trunc i32 %x to i16 %t1 = trunc i32 %sh1 to i16 @@ -216,12 +221,17 @@ define void @le_i32_to_i16(i32 %x, i16* %p0) { } define void @le_i32_to_i16_order(i32 %x, i16* %p0) { -; CHECK-LABEL: le_i32_to_i16_order: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #16 -; CHECK-NEXT: strh w8, [x1, #2] -; CHECK-NEXT: strh w0, [x1] -; CHECK-NEXT: ret +; LE-LABEL: le_i32_to_i16_order: +; LE: // %bb.0: +; LE-NEXT: str w0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: le_i32_to_i16_order: +; BE: // %bb.0: +; BE-NEXT: lsr w8, w0, #16 +; BE-NEXT: strh w8, [x1, #2] +; BE-NEXT: strh w0, [x1] +; BE-NEXT: ret %sh1 = lshr i32 %x, 16 %t0 = trunc i32 %x to i16 %t1 = trunc i32 %sh1 to i16 @@ -232,12 +242,17 @@ define void @le_i32_to_i16_order(i32 %x, i16* %p0) { } define void @be_i32_to_i16(i32 %x, i16* %p0) { -; CHECK-LABEL: be_i32_to_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #16 -; CHECK-NEXT: strh w0, [x1, #2] -; CHECK-NEXT: strh w8, [x1] -; CHECK-NEXT: ret +; LE-LABEL: be_i32_to_i16: +; LE: // %bb.0: +; LE-NEXT: lsr w8, w0, #16 +; LE-NEXT: strh w0, [x1, #2] +; LE-NEXT: strh w8, [x1] +; LE-NEXT: ret +; +; BE-LABEL: be_i32_to_i16: +; BE: // %bb.0: +; BE-NEXT: str w0, [x1] +; BE-NEXT: ret %sh1 = lshr i32 %x, 16 %t0 = trunc i32 %x to i16 %t1 = trunc i32 %sh1 to i16 @@ -248,12 +263,17 @@ define void @be_i32_to_i16(i32 %x, i16* %p0) { } define void @be_i32_to_i16_order(i32 %x, i16* %p0) { -; CHECK-LABEL: be_i32_to_i16_order: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #16 -; CHECK-NEXT: strh w8, [x1] -; CHECK-NEXT: strh w0, [x1, #2] -; CHECK-NEXT: ret +; LE-LABEL: be_i32_to_i16_order: +; LE: // %bb.0: +; LE-NEXT: lsr w8, w0, #16 +; LE-NEXT: strh w8, [x1] +; LE-NEXT: strh w0, [x1, #2] +; LE-NEXT: ret +; +; BE-LABEL: be_i32_to_i16_order: +; BE: // %bb.0: +; BE-NEXT: str w0, [x1] +; BE-NEXT: ret %sh1 = lshr i32 %x, 16 %t0 = trunc i32 %x to i16 %t1 = trunc i32 %sh1 to i16 @@ -440,16 +460,21 @@ define void @be_i64_to_i8_order(i64 %x, i8* %p0) { } define void @le_i64_to_i16(i64 %x, i16* %p0) { -; CHECK-LABEL: le_i64_to_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #16 -; CHECK-NEXT: lsr x9, x0, #32 -; CHECK-NEXT: lsr x10, x0, #48 -; CHECK-NEXT: strh w0, [x1] -; CHECK-NEXT: strh w8, [x1, #2] -; CHECK-NEXT: strh w9, [x1, #4] -; CHECK-NEXT: strh w10, [x1, #6] -; CHECK-NEXT: ret +; LE-LABEL: le_i64_to_i16: +; LE: // %bb.0: +; LE-NEXT: str x0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: le_i64_to_i16: +; BE: // %bb.0: +; BE-NEXT: lsr x8, x0, #16 +; BE-NEXT: lsr x9, x0, #32 +; BE-NEXT: lsr x10, x0, #48 +; BE-NEXT: strh w0, [x1] +; BE-NEXT: strh w8, [x1, #2] +; BE-NEXT: strh w9, [x1, #4] +; BE-NEXT: 
strh w10, [x1, #6] +; BE-NEXT: ret %sh1 = lshr i64 %x, 16 %sh2 = lshr i64 %x, 32 %sh3 = lshr i64 %x, 48 @@ -468,16 +493,21 @@ define void @le_i64_to_i16(i64 %x, i16* %p0) { } define void @le_i64_to_i16_order(i64 %x, i16* %p0) { -; CHECK-LABEL: le_i64_to_i16_order: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #16 -; CHECK-NEXT: lsr x9, x0, #32 -; CHECK-NEXT: lsr x10, x0, #48 -; CHECK-NEXT: strh w0, [x1] -; CHECK-NEXT: strh w8, [x1, #2] -; CHECK-NEXT: strh w10, [x1, #6] -; CHECK-NEXT: strh w9, [x1, #4] -; CHECK-NEXT: ret +; LE-LABEL: le_i64_to_i16_order: +; LE: // %bb.0: +; LE-NEXT: str x0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: le_i64_to_i16_order: +; BE: // %bb.0: +; BE-NEXT: lsr x8, x0, #16 +; BE-NEXT: lsr x9, x0, #32 +; BE-NEXT: lsr x10, x0, #48 +; BE-NEXT: strh w0, [x1] +; BE-NEXT: strh w8, [x1, #2] +; BE-NEXT: strh w10, [x1, #6] +; BE-NEXT: strh w9, [x1, #4] +; BE-NEXT: ret %sh1 = lshr i64 %x, 16 %sh2 = lshr i64 %x, 32 %sh3 = lshr i64 %x, 48 @@ -496,16 +526,21 @@ define void @le_i64_to_i16_order(i64 %x, i16* %p0) { } define void @be_i64_to_i16(i64 %x, i16* %p0) { -; CHECK-LABEL: be_i64_to_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #16 -; CHECK-NEXT: lsr x9, x0, #32 -; CHECK-NEXT: lsr x10, x0, #48 -; CHECK-NEXT: strh w0, [x1, #6] -; CHECK-NEXT: strh w8, [x1, #4] -; CHECK-NEXT: strh w9, [x1, #2] -; CHECK-NEXT: strh w10, [x1] -; CHECK-NEXT: ret +; LE-LABEL: be_i64_to_i16: +; LE: // %bb.0: +; LE-NEXT: lsr x8, x0, #16 +; LE-NEXT: lsr x9, x0, #32 +; LE-NEXT: lsr x10, x0, #48 +; LE-NEXT: strh w0, [x1, #6] +; LE-NEXT: strh w8, [x1, #4] +; LE-NEXT: strh w9, [x1, #2] +; LE-NEXT: strh w10, [x1] +; LE-NEXT: ret +; +; BE-LABEL: be_i64_to_i16: +; BE: // %bb.0: +; BE-NEXT: str x0, [x1] +; BE-NEXT: ret %sh1 = lshr i64 %x, 16 %sh2 = lshr i64 %x, 32 %sh3 = lshr i64 %x, 48 @@ -524,16 +559,21 @@ define void @be_i64_to_i16(i64 %x, i16* %p0) { } define void @be_i64_to_i16_order(i64 %x, i16* %p0) { -; CHECK-LABEL: be_i64_to_i16_order: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #16 -; CHECK-NEXT: lsr x9, x0, #32 -; CHECK-NEXT: lsr x10, x0, #48 -; CHECK-NEXT: strh w0, [x1, #6] -; CHECK-NEXT: strh w10, [x1] -; CHECK-NEXT: strh w9, [x1, #2] -; CHECK-NEXT: strh w8, [x1, #4] -; CHECK-NEXT: ret +; LE-LABEL: be_i64_to_i16_order: +; LE: // %bb.0: +; LE-NEXT: lsr x8, x0, #16 +; LE-NEXT: lsr x9, x0, #32 +; LE-NEXT: lsr x10, x0, #48 +; LE-NEXT: strh w0, [x1, #6] +; LE-NEXT: strh w10, [x1] +; LE-NEXT: strh w9, [x1, #2] +; LE-NEXT: strh w8, [x1, #4] +; LE-NEXT: ret +; +; BE-LABEL: be_i64_to_i16_order: +; BE: // %bb.0: +; BE-NEXT: str x0, [x1] +; BE-NEXT: ret %sh1 = lshr i64 %x, 16 %sh2 = lshr i64 %x, 32 %sh3 = lshr i64 %x, 48 @@ -552,11 +592,16 @@ define void @be_i64_to_i16_order(i64 %x, i16* %p0) { } define void @le_i64_to_i32(i64 %x, i32* %p0) { -; CHECK-LABEL: le_i64_to_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #32 -; CHECK-NEXT: stp w0, w8, [x1] -; CHECK-NEXT: ret +; LE-LABEL: le_i64_to_i32: +; LE: // %bb.0: +; LE-NEXT: str x0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: le_i64_to_i32: +; BE: // %bb.0: +; BE-NEXT: lsr x8, x0, #32 +; BE-NEXT: stp w0, w8, [x1] +; BE-NEXT: ret %sh1 = lshr i64 %x, 32 %t0 = trunc i64 %x to i32 %t1 = trunc i64 %sh1 to i32 @@ -567,11 +612,16 @@ define void @le_i64_to_i32(i64 %x, i32* %p0) { } define void @le_i64_to_i32_order(i64 %x, i32* %p0) { -; CHECK-LABEL: le_i64_to_i32_order: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #32 -; CHECK-NEXT: stp w0, w8, [x1] -; CHECK-NEXT: ret +; LE-LABEL: le_i64_to_i32_order: +; LE: // %bb.0: +; LE-NEXT: str x0, [x1] +; LE-NEXT: ret +; 
+; BE-LABEL: le_i64_to_i32_order: +; BE: // %bb.0: +; BE-NEXT: lsr x8, x0, #32 +; BE-NEXT: stp w0, w8, [x1] +; BE-NEXT: ret %sh1 = lshr i64 %x, 32 %t0 = trunc i64 %x to i32 %t1 = trunc i64 %sh1 to i32 @@ -582,11 +632,16 @@ define void @le_i64_to_i32_order(i64 %x, i32* %p0) { } define void @be_i64_to_i32(i64 %x, i32* %p0) { -; CHECK-LABEL: be_i64_to_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #32 -; CHECK-NEXT: stp w8, w0, [x1] -; CHECK-NEXT: ret +; LE-LABEL: be_i64_to_i32: +; LE: // %bb.0: +; LE-NEXT: lsr x8, x0, #32 +; LE-NEXT: stp w8, w0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: be_i64_to_i32: +; BE: // %bb.0: +; BE-NEXT: str x0, [x1] +; BE-NEXT: ret %sh1 = lshr i64 %x, 32 %t0 = trunc i64 %x to i32 %t1 = trunc i64 %sh1 to i32 @@ -597,11 +652,16 @@ define void @be_i64_to_i32(i64 %x, i32* %p0) { } define void @be_i64_to_i32_order(i64 %x, i32* %p0) { -; CHECK-LABEL: be_i64_to_i32_order: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #32 -; CHECK-NEXT: stp w8, w0, [x1] -; CHECK-NEXT: ret +; LE-LABEL: be_i64_to_i32_order: +; LE: // %bb.0: +; LE-NEXT: lsr x8, x0, #32 +; LE-NEXT: stp w8, w0, [x1] +; LE-NEXT: ret +; +; BE-LABEL: be_i64_to_i32_order: +; BE: // %bb.0: +; BE-NEXT: str x0, [x1] +; BE-NEXT: ret %sh1 = lshr i64 %x, 32 %t0 = trunc i64 %x to i32 %t1 = trunc i64 %sh1 to i32 @@ -611,6 +671,8 @@ define void @be_i64_to_i32_order(i64 %x, i32* %p0) { ret void } +; Negative test - not consecutive addresses + define void @i64_to_i32_wrong_addr(i64 %x, i32* %p0) { ; CHECK-LABEL: i64_to_i32_wrong_addr: ; CHECK: // %bb.0: @@ -627,6 +689,8 @@ define void @i64_to_i32_wrong_addr(i64 %x, i32* %p0) { ret void } +; Negative test - addresses don't line up with shift amounts + define void @i64_to_i16_wrong_order(i64 %x, i16* %p0) { ; CHECK-LABEL: i64_to_i16_wrong_order: ; CHECK: // %bb.0: @@ -655,6 +719,8 @@ define void @i64_to_i16_wrong_order(i64 %x, i16* %p0) { ret void } +; Negative test - no store of 't1' + define void @i32_to_i8_incomplete(i32 %x, i8* %p0) { ; CHECK-LABEL: i32_to_i8_incomplete: ; CHECK: // %bb.0: @@ -680,6 +746,8 @@ define void @i32_to_i8_incomplete(i32 %x, i8* %p0) { ret void } +; Negative test - no store of 't3' + define void @i64_to_i8_incomplete(i64 %x, i8* %p0) { ; CHECK-LABEL: i64_to_i8_incomplete: ; CHECK: // %bb.0: @@ -729,6 +797,8 @@ define void @i64_to_i8_incomplete(i64 %x, i8* %p0) { ret void } +; Negative test - not consecutive addresses + define void @i32_to_i16_wrong_addr(i32 %x, i16* %p0) { ; CHECK-LABEL: i32_to_i16_wrong_addr: ; CHECK: // %bb.0: @@ -745,6 +815,8 @@ define void @i32_to_i16_wrong_addr(i32 %x, i16* %p0) { ret void } +; Negative test - addresses don't line up with shift amounts + define void @i32_to_i8_wrong_order(i32 %x, i8* %p0) { ; CHECK-LABEL: i32_to_i8_wrong_order: ; CHECK: // %bb.0: diff --git a/llvm/test/CodeGen/X86/stores-merging.ll b/llvm/test/CodeGen/X86/stores-merging.ll index 4467fec9f2b456..85a086503410e8 100644 --- a/llvm/test/CodeGen/X86/stores-merging.ll +++ b/llvm/test/CodeGen/X86/stores-merging.ll @@ -468,9 +468,7 @@ define void @trunc_i32_to_i8(i32 %x, i8* %p) { define void @trunc_i32_to_i16(i32 %x, i16* %p) { ; CHECK-LABEL: trunc_i32_to_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: movw %di, (%rsi) -; CHECK-NEXT: shrl $16, %edi -; CHECK-NEXT: movw %di, 2(%rsi) +; CHECK-NEXT: movl %edi, (%rsi) ; CHECK-NEXT: retq %t1 = trunc i32 %x to i16 %sh = lshr i32 %x, 16 @@ -522,15 +520,7 @@ define void @trunc_i64_to_i8(i64 %x, i8* %p) { define void @trunc_i64_to_i16(i64 %x, i16* %p) { ; CHECK-LABEL: trunc_i64_to_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: 
movq %rdi, %rax -; CHECK-NEXT: movq %rdi, %rcx -; CHECK-NEXT: movw %di, (%rsi) -; CHECK-NEXT: shrq $16, %rdi -; CHECK-NEXT: shrq $32, %rax -; CHECK-NEXT: shrq $48, %rcx -; CHECK-NEXT: movw %di, 2(%rsi) -; CHECK-NEXT: movw %ax, 4(%rsi) -; CHECK-NEXT: movw %cx, 6(%rsi) +; CHECK-NEXT: movq %rdi, (%rsi) ; CHECK-NEXT: retq %t1 = trunc i64 %x to i16 %sh1 = lshr i64 %x, 16 @@ -552,9 +542,7 @@ define void @trunc_i64_to_i16(i64 %x, i16* %p) { define void @trunc_i64_to_i32(i64 %x, i32* %p) { ; CHECK-LABEL: trunc_i64_to_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, (%rsi) -; CHECK-NEXT: shrq $32, %rdi -; CHECK-NEXT: movl %edi, 4(%rsi) +; CHECK-NEXT: movq %rdi, (%rsi) ; CHECK-NEXT: retq %t1 = trunc i64 %x to i32 %sh = lshr i64 %x, 32 From 9936455204fd6ab72715cc9d67385ddc93e072ed Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Wed, 26 Aug 2020 19:36:13 +0000 Subject: [PATCH 19/19] Reapply D70800: Fix AArch64 AAPCS frame record chain Original Commit Message: After the commit r368987 (rG643adb55769e) was landed, the frame record (FP and LR register) may be placed in the middle of a stack frame if a function has both callee-saved general-purpose registers and floating point registers. This will break the stack unwinders that simply walk through the frame records (based on the guarantee from AAPCS64 "The Frame Pointer" section). This commit fixes the problem by adding the frame record offset. Patch By: logan --- .../Target/AArch64/AArch64FrameLowering.cpp | 36 ++++++++++--------- .../AArch64/AArch64MachineFunctionInfo.h | 11 ++++++ .../CodeGen/AArch64/framelayout-fp-csr.ll | 22 ++++++++++++ .../AArch64/framelayout-frame-record.mir | 29 +++++++++++++++ 4 files changed, 81 insertions(+), 17 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/framelayout-fp-csr.ll create mode 100644 llvm/test/CodeGen/AArch64/framelayout-frame-record.mir diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index c6cc6e9e847183..751791bdb354a0 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1024,10 +1024,6 @@ static bool needsWinCFI(const MachineFunction &MF) { F.needsUnwindTableEntry(); } -static bool isTargetDarwin(const MachineFunction &MF) { - return MF.getSubtarget().isTargetDarwin(); -} - static bool isTargetWindows(const MachineFunction &MF) { return MF.getSubtarget().isTargetWindows(); } @@ -1185,7 +1181,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // For funclets the FP belongs to the containing function. if (!IsFunclet && HasFP) { // Only set up FP if we actually need to. - int64_t FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0; + int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset(); if (CombineSPBump) FPOffset += AFI->getLocalStackSize(); @@ -1409,11 +1405,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } if (needsFrameMoves) { - const DataLayout &TD = MF.getDataLayout(); - const int StackGrowth = isTargetDarwin(MF) - ? 
(2 * -TD.getPointerSize(0)) - : -AFI->getCalleeSavedStackSize(); - Register FramePtr = RegInfo->getFrameRegister(MF); // An example of the prologue: // // .globl __foo @@ -1481,10 +1472,15 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // .cfi_offset w28, -32 if (HasFP) { + const int OffsetToFirstCalleeSaveFromFP = + AFI->getCalleeSaveBaseToFrameRecordOffset() - + AFI->getCalleeSavedStackSize(); + Register FramePtr = RegInfo->getFrameRegister(MF); + // Define the current CFA rule to use the provided FP. unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - StackGrowth)); + MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -1775,10 +1771,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // non-post-indexed loads for the restores if we aren't actually going to // be able to save any instructions. if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) { - int64_t OffsetToFrameRecord = - isTargetDarwin(MF) ? (-(int64_t)AFI->getCalleeSavedStackSize() + 16) : 0; emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - {OffsetToFrameRecord, MVT::i8}, + {-AFI->getCalleeSaveBaseToFrameRecordOffset(), MVT::i8}, TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); } else if (NumBytes) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, @@ -1839,11 +1833,11 @@ static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset) const auto &Subtarget = MF.getSubtarget(); bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); - unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false); - unsigned FPAdjust = isTargetDarwin(MF) - ? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo()); + int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo()); + int64_t FPAdjust = + CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset(); return {ObjectOffset + FixedObject + FPAdjust, MVT::i8}; } @@ -2231,6 +2225,14 @@ static void computeCalleeSaveRegisterPairs( (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) && "Offset out of bounds for LDP/STP immediate"); + // Save the offset to frame record so that the FP register can point to the + // innermost frame record (spilled FP and LR registers). + if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR && + RPI.Reg2 == AArch64::FP) || + (IsWindows && RPI.Reg1 == AArch64::FP && + RPI.Reg2 == AArch64::LR))) + AFI->setCalleeSaveBaseToFrameRecordOffset(Offset); + RegPairs.push_back(RPI); if (RPI.isPaired()) ++i; diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 84aa53f2bece19..9562269336d8d8 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -135,6 +135,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// e.g. Tail Call, Thunk, or Function if none apply. Optional OutliningStyle; + // Offset from SP-after-callee-saved-spills (i.e. SP-at-entry minus + // CalleeSavedStackSize) to the address of the frame record. 
+ int CalleeSaveBaseToFrameRecordOffset = 0; + public: AArch64FunctionInfo() = default; @@ -338,6 +342,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { TaggedBasePointerOffset = Offset; } + int getCalleeSaveBaseToFrameRecordOffset() const { + return CalleeSaveBaseToFrameRecordOffset; + } + void setCalleeSaveBaseToFrameRecordOffset(int Offset) { + CalleeSaveBaseToFrameRecordOffset = Offset; + } + private: // Hold the lists of LOHs. MILOHContainer LOHContainerSet; diff --git a/llvm/test/CodeGen/AArch64/framelayout-fp-csr.ll b/llvm/test/CodeGen/AArch64/framelayout-fp-csr.ll new file mode 100644 index 00000000000000..3b13dee29f0696 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/framelayout-fp-csr.ll @@ -0,0 +1,22 @@ +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -disable-post-ra --frame-pointer=all < %s | FileCheck %s + +; The purpose of this test is to verify that frame pointer (x29) +; is correctly setup in the presence of callee-saved floating +; point registers. The frame pointer should point to the frame +; record, which is located 16 bytes above the end of the CSR +; space when a single FP CSR is in use. +define void @test1(i32) #26 { +entry: + call void asm sideeffect "nop", "~{d8}"() #26 + ret void +} +; CHECK-LABEL: test1: +; CHECK: str d8, [sp, #-32]! +; CHECK-NEXT: stp x29, x30, [sp, #16] +; CHECK-NEXT: add x29, sp, #16 +; CHECK: nop +; CHECK: ldp x29, x30, [sp, #16] +; CHECK-NEXT: ldr d8, [sp], #32 +; CHECK-NEXT: ret + +attributes #26 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/framelayout-frame-record.mir b/llvm/test/CodeGen/AArch64/framelayout-frame-record.mir new file mode 100644 index 00000000000000..ab4af04401c5e2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/framelayout-frame-record.mir @@ -0,0 +1,29 @@ +# RUN: llc -mtriple=aarch64-linux-gnu -start-before prologepilog %s -o - | FileCheck %s + +--- +name: TestFrameRecordLocation +tracksRegLiveness: true +frameInfo: + isFrameAddressTaken: true +body: | + bb.0: + $d8 = IMPLICIT_DEF + $d9 = IMPLICIT_DEF + $x19 = IMPLICIT_DEF + RET_ReallyLR + +# CHECK-LABEL: TestFrameRecordLocation + +# CHECK: stp d9, d8, [sp, #-48]! +# CHECK: stp x29, x30, [sp, #16] +# CHECK: str x19, [sp, #32] + +# CHECK: add x29, sp, #16 + +# CHECK: .cfi_def_cfa w29, 32 +# CHECK: .cfi_offset w19, -16 +# CHECK: .cfi_offset w30, -24 +# CHECK: .cfi_offset w29, -32 +# CHECK: .cfi_offset b8, -40 +# CHECK: .cfi_offset b9, -48 +...
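
A rough sketch of the unwinder pattern this last patch protects, assuming the AAPCS64 frame-record layout the commit message cites: x29 points at a two-slot record holding the caller's x29 and the saved x30. The struct name, the iteration bound, and the use of `__builtin_frame_address` are illustrative assumptions, not code from LLVM or from this patch.

```cpp
// Frame-pointer walk that relies on x29 pointing at { caller's x29, saved x30 }.
// If the frame record were placed in the middle of the callee-save area
// (as could happen before this fix when FP/SIMD registers were also saved),
// the first dereference below would read a spilled d-register pair instead
// of a frame record and the walk would produce garbage.
#include <cstdio>

struct FrameRecord {
  FrameRecord *Prev;   // saved x29 of the caller
  void *ReturnAddress; // saved x30 (LR)
};

[[gnu::noinline]] void printBacktrace() {
  // With frame pointers kept, __builtin_frame_address(0) is the current x29.
  auto *FR = static_cast<FrameRecord *>(__builtin_frame_address(0));
  for (int Depth = 0; FR && Depth < 64; ++Depth) { // bound the walk defensively
    std::printf("frame %d: return address %p\n", Depth, FR->ReturnAddress);
    FR = FR->Prev;
  }
}
```

The patch preserves that invariant by recording, when the {x29, x30} pair is spilled, its offset from the callee-save base (`setCalleeSaveBaseToFrameRecordOffset`) and then pointing x29 at exactly that offset in the prologue, so the record stays at the address the frame pointer advertises even when FP/SIMD callee-saves are spilled around it.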