diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index a32bc0c84c7018..ecc4e7ee19fbc1 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -9374,16 +9374,22 @@ static Comparison compareEnableIfAttrs(const Sema &S, const FunctionDecl *Cand1, return Comparison::Equal; } -static bool isBetterMultiversionCandidate(const OverloadCandidate &Cand1, - const OverloadCandidate &Cand2) { +static Comparison +isBetterMultiversionCandidate(const OverloadCandidate &Cand1, + const OverloadCandidate &Cand2) { if (!Cand1.Function || !Cand1.Function->isMultiVersion() || !Cand2.Function || !Cand2.Function->isMultiVersion()) - return false; + return Comparison::Equal; - // If Cand1 is invalid, it cannot be a better match, if Cand2 is invalid, this - // is obviously better. - if (Cand1.Function->isInvalidDecl()) return false; - if (Cand2.Function->isInvalidDecl()) return true; + // If both are invalid, they are equal. If one of them is invalid, the other + // is better. + if (Cand1.Function->isInvalidDecl()) { + if (Cand2.Function->isInvalidDecl()) + return Comparison::Equal; + return Comparison::Worse; + } + if (Cand2.Function->isInvalidDecl()) + return Comparison::Better; // If this is a cpu_dispatch/cpu_specific multiversion situation, prefer // cpu_dispatch, else arbitrarily based on the identifiers. @@ -9393,16 +9399,18 @@ static bool isBetterMultiversionCandidate(const OverloadCandidate &Cand1, const auto *Cand2CPUSpec = Cand2.Function->getAttr(); if (!Cand1CPUDisp && !Cand2CPUDisp && !Cand1CPUSpec && !Cand2CPUSpec) - return false; + return Comparison::Equal; if (Cand1CPUDisp && !Cand2CPUDisp) - return true; + return Comparison::Better; if (Cand2CPUDisp && !Cand1CPUDisp) - return false; + return Comparison::Worse; if (Cand1CPUSpec && Cand2CPUSpec) { if (Cand1CPUSpec->cpus_size() != Cand2CPUSpec->cpus_size()) - return Cand1CPUSpec->cpus_size() < Cand2CPUSpec->cpus_size(); + return Cand1CPUSpec->cpus_size() < Cand2CPUSpec->cpus_size() + ? Comparison::Better + : Comparison::Worse; std::pair FirstDiff = std::mismatch( @@ -9415,7 +9423,9 @@ static bool isBetterMultiversionCandidate(const OverloadCandidate &Cand1, assert(FirstDiff.first != Cand1CPUSpec->cpus_end() && "Two different cpu-specific versions should not have the same " "identifier list, otherwise they'd be the same decl!"); - return (*FirstDiff.first)->getName() < (*FirstDiff.second)->getName(); + return (*FirstDiff.first)->getName() < (*FirstDiff.second)->getName() + ? Comparison::Better + : Comparison::Worse; } llvm_unreachable("No way to get here unless both had cpu_dispatch"); } @@ -9475,6 +9485,50 @@ bool clang::isBetterOverloadCandidate( else if (!Cand1.Viable) return false; + // [CUDA] A function with 'never' preference is marked not viable, therefore + // is never shown up here. The worst preference shown up here is 'wrong side', + // e.g. a host function called by a device host function in device + // compilation. This is valid AST as long as the host device function is not + // emitted, e.g. it is an inline function which is called only by a host + // function. A deferred diagnostic will be triggered if it is emitted. + // However a wrong-sided function is still a viable candidate here. + // + // If Cand1 can be emitted and Cand2 cannot be emitted in the current + // context, Cand1 is better than Cand2. If Cand1 can not be emitted and Cand2 + // can be emitted, Cand1 is not better than Cand2. This rule should have + // precedence over other rules. 
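As a reviewer aside, the ordering the comment above describes can be condensed into a short sketch. The enum and the "standard rank" below are illustrative stand-ins, not the actual Sema::CUDAFunctionPreference values or overload machinery:

```cpp
// Condensed sketch of the ordering described above; the enum and the
// "standard rank" are illustrative stand-ins, not the real Sema types.
enum Preference { WrongSide, HostDevice, SameSide, Native };

struct Candidate {
  Preference CudaPref; // 'never' candidates were already filtered out
  int StandardRank;    // stand-in for the usual C++ tie-breakers
};

bool isBetter(const Candidate &C1, const Candidate &C2) {
  // 1. Emittability in the current compilation mode wins outright.
  bool E1 = C1.CudaPref > WrongSide, E2 = C2.CudaPref > WrongSide;
  if (E1 != E2)
    return E1;
  // 2. Otherwise the standard C++ rules decide.
  if (C1.StandardRank != C2.StandardRank)
    return C1.StandardRank > C2.StandardRank;
  // 3. Only as a last resort, fall back to the CUDA call preference.
  return C1.CudaPref > C2.CudaPref;
}
```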
+ // + // If both Cand1 and Cand2 can be emitted, or neither can be emitted, then + // other rules should be used to determine which is better. This is because + // host/device based overloading resolution is mostly for determining + // viability of a function. If two functions are both viable, other factors + // should take precedence in preference, e.g. the standard-defined preferences + // like argument conversion ranks or enable_if partial-ordering. The + // preference for pass-object-size parameters is probably most similar to a + // type-based-overloading decision and so should take priority. + // + // If other rules cannot determine which is better, CUDA preference will be + // used again to determine which is better. + // + // TODO: Currently IdentifyCUDAPreference does not return correct values + // for functions called in global variable initializers due to missing + // correct context about device/host. Therefore we can only enforce this + // rule when there is a caller. We should enforce this rule for functions + // in global variable initializers once proper context is added. + if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) { + if (FunctionDecl *Caller = dyn_cast(S.CurContext)) { + auto P1 = S.IdentifyCUDAPreference(Caller, Cand1.Function); + auto P2 = S.IdentifyCUDAPreference(Caller, Cand2.Function); + assert(P1 != Sema::CFP_Never && P2 != Sema::CFP_Never); + auto Cand1Emittable = P1 > Sema::CFP_WrongSide; + auto Cand2Emittable = P2 > Sema::CFP_WrongSide; + if (Cand1Emittable && !Cand2Emittable) + return true; + if (!Cand1Emittable && Cand2Emittable) + return false; + } + } + // C++ [over.match.best]p1: // // -- if F is a static member function, ICS1(F) is defined such @@ -9709,12 +9763,6 @@ bool clang::isBetterOverloadCandidate( return Cmp == Comparison::Better; } - if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) { - FunctionDecl *Caller = dyn_cast(S.CurContext); - return S.IdentifyCUDAPreference(Caller, Cand1.Function) > - S.IdentifyCUDAPreference(Caller, Cand2.Function); - } - bool HasPS1 = Cand1.Function != nullptr && functionHasPassObjectSizeParams(Cand1.Function); bool HasPS2 = Cand2.Function != nullptr && @@ -9722,7 +9770,22 @@ bool clang::isBetterOverloadCandidate( if (HasPS1 != HasPS2 && HasPS1) return true; - return isBetterMultiversionCandidate(Cand1, Cand2); + auto MV = isBetterMultiversionCandidate(Cand1, Cand2); + if (MV == Comparison::Better) + return true; + if (MV == Comparison::Worse) + return false; + + // If other rules cannot determine which is better, CUDA preference is used + // to determine which is better. + if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) { + if (FunctionDecl *Caller = dyn_cast(S.CurContext)) { + return S.IdentifyCUDAPreference(Caller, Cand1.Function) > + S.IdentifyCUDAPreference(Caller, Cand2.Function); + } + } + + return false; } /// Determine whether two declarations are "equivalent" for the purposes of @@ -9808,33 +9871,6 @@ OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc, std::transform(begin(), end(), std::back_inserter(Candidates), [](OverloadCandidate &Cand) { return &Cand; }); - // [CUDA] HD->H or HD->D calls are technically not allowed by CUDA but - // are accepted by both clang and NVCC. However, during a particular - // compilation mode only one call variant is viable. We need to - // exclude non-viable overload candidates from consideration based - // only on their host/device attributes. 
Specifically, if one - // candidate call is WrongSide and the other is SameSide, we ignore - // the WrongSide candidate. - if (S.getLangOpts().CUDA) { - const FunctionDecl *Caller = dyn_cast(S.CurContext); - bool ContainsSameSideCandidate = - llvm::any_of(Candidates, [&](OverloadCandidate *Cand) { - // Check viable function only. - return Cand->Viable && Cand->Function && - S.IdentifyCUDAPreference(Caller, Cand->Function) == - Sema::CFP_SameSide; - }); - if (ContainsSameSideCandidate) { - auto IsWrongSideCandidate = [&](OverloadCandidate *Cand) { - // Check viable function only to avoid unnecessary data copying/moving. - return Cand->Viable && Cand->Function && - S.IdentifyCUDAPreference(Caller, Cand->Function) == - Sema::CFP_WrongSide; - }; - llvm::erase_if(Candidates, IsWrongSideCandidate); - } - } - // Find the best viable function. Best = end(); for (auto *Cand : Candidates) { diff --git a/clang/test/SemaCUDA/function-overload.cu b/clang/test/SemaCUDA/function-overload.cu index b9efd1c09e6994..b0e2852a12a755 100644 --- a/clang/test/SemaCUDA/function-overload.cu +++ b/clang/test/SemaCUDA/function-overload.cu @@ -331,9 +331,6 @@ __device__ void test_device_calls_template_fn() { // If we have a mix of HD and H-only or D-only candidates in the overload set, // normal C++ overload resolution rules apply first. template TemplateReturnTy template_vs_hd_function(T arg) -#ifdef __CUDA_ARCH__ -//expected-note@-2 {{declared here}} -#endif { return TemplateReturnTy(); } @@ -342,11 +339,13 @@ __host__ __device__ HostDeviceReturnTy template_vs_hd_function(float arg) { } __host__ __device__ void test_host_device_calls_hd_template() { - HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f); - TemplateReturnTy ret2 = template_vs_hd_function(1); #ifdef __CUDA_ARCH__ - // expected-error@-2 {{reference to __host__ function 'template_vs_hd_function' in __host__ __device__ function}} + typedef HostDeviceReturnTy ExpectedReturnTy; +#else + typedef TemplateReturnTy ExpectedReturnTy; #endif + HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f); + ExpectedReturnTy ret2 = template_vs_hd_function(1); } __host__ void test_host_calls_hd_template() { @@ -367,14 +366,14 @@ __device__ void test_device_calls_hd_template() { __device__ DeviceReturnTy device_only_function(int arg) { return DeviceReturnTy(); } __device__ DeviceReturnTy2 device_only_function(float arg) { return DeviceReturnTy2(); } #ifndef __CUDA_ARCH__ - // expected-note@-3 {{'device_only_function' declared here}} - // expected-note@-3 {{'device_only_function' declared here}} + // expected-note@-3 2{{'device_only_function' declared here}} + // expected-note@-3 2{{'device_only_function' declared here}} #endif __host__ HostReturnTy host_only_function(int arg) { return HostReturnTy(); } __host__ HostReturnTy2 host_only_function(float arg) { return HostReturnTy2(); } #ifdef __CUDA_ARCH__ - // expected-note@-3 {{'host_only_function' declared here}} - // expected-note@-3 {{'host_only_function' declared here}} + // expected-note@-3 2{{'host_only_function' declared here}} + // expected-note@-3 2{{'host_only_function' declared here}} #endif __host__ __device__ void test_host_device_single_side_overloading() { @@ -392,6 +391,37 @@ __host__ __device__ void test_host_device_single_side_overloading() { #endif } +// wrong-sided overloading should not cause diagnostic unless it is emitted. +// This inline function is not emitted. 
+inline __host__ __device__ void test_host_device_wrong_side_overloading_inline_no_diag() { + DeviceReturnTy ret1 = device_only_function(1); + DeviceReturnTy2 ret2 = device_only_function(1.0f); + HostReturnTy ret3 = host_only_function(1); + HostReturnTy2 ret4 = host_only_function(1.0f); +} + +// wrong-sided overloading should cause diagnostic if it is emitted. +// This inline function is emitted since it is called by an emitted function. +inline __host__ __device__ void test_host_device_wrong_side_overloading_inline_diag() { + DeviceReturnTy ret1 = device_only_function(1); + DeviceReturnTy2 ret2 = device_only_function(1.0f); +#ifndef __CUDA_ARCH__ + // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}} + // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}} +#endif + HostReturnTy ret3 = host_only_function(1); + HostReturnTy2 ret4 = host_only_function(1.0f); +#ifdef __CUDA_ARCH__ + // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}} + // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}} +#endif +} + +__host__ __device__ void test_host_device_wrong_side_overloading_inline_diag_caller() { + test_host_device_wrong_side_overloading_inline_diag(); + // expected-note@-1 {{called by 'test_host_device_wrong_side_overloading_inline_diag_caller'}} +} + // Verify that we allow overloading function templates. template __host__ T template_overload(const T &a) { return a; }; template __device__ T template_overload(const T &a) { return a; }; diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp index 29dad43b62eba4..082aaf41155afb 100644 --- a/compiler-rt/lib/xray/xray_interface.cpp +++ b/compiler-rt/lib/xray/xray_interface.cpp @@ -295,7 +295,7 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { for (std::size_t I = 0; I < InstrMap.Entries; ++I) { auto &Sled = InstrMap.Sleds[I]; - auto F = Sled.Function; + auto F = Sled.function(); if (CurFun == 0) CurFun = F; if (F != CurFun) { @@ -466,7 +466,7 @@ uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { SpinMutexLock Guard(&XRayInstrMapMutex); if (FuncId <= 0 || static_cast(FuncId) > XRayInstrMap.Functions) return 0; - return XRayInstrMap.SledsIndex[FuncId - 1].Begin->Function + return XRayInstrMap.SledsIndex[FuncId - 1].Begin->function() // On PPC, function entries are always aligned to 16 bytes. The beginning of a // sled might be a local entry, which is always +8 based on the global entry. // Always return the global entry. diff --git a/compiler-rt/lib/xray/xray_interface_internal.h b/compiler-rt/lib/xray/xray_interface_internal.h index cdd1b9cbe2d669..390f389b1dca66 100644 --- a/compiler-rt/lib/xray/xray_interface_internal.h +++ b/compiler-rt/lib/xray/xray_interface_internal.h @@ -29,6 +29,12 @@ struct XRaySledEntry { unsigned char AlwaysInstrument; unsigned char Version; unsigned char Padding[13]; // Need 32 bytes + uint64_t function() const { + if (Version < 2) + return Function; + // The target address is relative to the location of the Function variable. 
+ return reinterpret_cast(&Function) + Function; + } uint64_t address() const { if (Version < 2) return Address; @@ -42,6 +48,12 @@ struct XRaySledEntry { unsigned char AlwaysInstrument; unsigned char Version; unsigned char Padding[5]; // Need 16 bytes + uint32_t function() const { + if (Version < 2) + return Function; + // The target address is relative to the location of the Function variable. + return reinterpret_cast(&Function) + Function; + } uint32_t address() const { if (Version < 2) return Address; diff --git a/flang/runtime/character.cpp b/flang/runtime/character.cpp index b6a804dfa03f73..e65ac38dee874c 100644 --- a/flang/runtime/character.cpp +++ b/flang/runtime/character.cpp @@ -7,11 +7,60 @@ //===----------------------------------------------------------------------===// #include "character.h" +#include "descriptor.h" #include "terminator.h" #include #include namespace Fortran::runtime { + +template +inline int CompareToBlankPadding(const C *x, std::size_t chars) { + for (; chars-- > 0; ++x) { + if (*x < ' ') { + return -1; + } + if (*x > ' ') { + return 1; + } + } + return 0; +} + +template +static int Compare( + const C *x, const C *y, std::size_t xBytes, std::size_t yBytes) { + auto minBytes{std::min(xBytes, yBytes)}; + if constexpr (shift == 0) { + // don't use for kind=2 or =4, that would fail on little-endian machines + int cmp{std::memcmp(x, y, minBytes)}; + if (cmp < 0) { + return -1; + } + if (cmp > 0) { + return 1; + } + if (xBytes == yBytes) { + return 0; + } + x += minBytes; + y += minBytes; + } else { + for (std::size_t n{minBytes >> shift}; n-- > 0; ++x, ++y) { + if (*x < *y) { + return -1; + } + if (*x > *y) { + return 1; + } + } + } + if (int cmp{CompareToBlankPadding(x, (xBytes - minBytes) >> shift)}) { + return cmp; + } + return -CompareToBlankPadding(y, (yBytes - minBytes) >> shift); +} + extern "C" { void RTNAME(CharacterConcatenate)(Descriptor & /*temp*/, @@ -30,18 +79,43 @@ void RTNAME(CharacterAssign)(Descriptor & /*lhs*/, const Descriptor & /*rhs*/, // TODO } -std::size_t RTNAME(CharacterAppend)(char *lhs, std::size_t lhsLength, - std::size_t offset, const char *rhs, std::size_t rhsLength) { - if (auto n{std::min(lhsLength - offset, rhsLength)}) { +int RTNAME(CharacterCompareScalar)(const Descriptor &, const Descriptor &) { + // TODO real soon once there's type codes for character(kind=2 & 4) + return 0; +} + +int RTNAME(CharacterCompareScalar1)( + const char *x, const char *y, std::size_t xBytes, std::size_t yBytes) { + return Compare(x, y, xBytes, yBytes); +} + +int RTNAME(CharacterCompareScalar2)(const char16_t *x, const char16_t *y, + std::size_t xBytes, std::size_t yBytes) { + return Compare(x, y, xBytes, yBytes); +} + +int RTNAME(CharacterCompareScalar4)(const char32_t *x, const char32_t *y, + std::size_t xBytes, std::size_t yBytes) { + return Compare(x, y, xBytes, yBytes); +} + +void RTNAME(CharacterCompare)( + Descriptor &, const Descriptor &, const Descriptor &) { + // TODO real soon once there's type codes for character(kind=2 & 4) +} + +std::size_t RTNAME(CharacterAppend1)(char *lhs, std::size_t lhsBytes, + std::size_t offset, const char *rhs, std::size_t rhsBytes) { + if (auto n{std::min(lhsBytes - offset, rhsBytes)}) { std::memcpy(lhs + offset, rhs, n); offset += n; } return offset; } -void RTNAME(CharacterPad)(char *lhs, std::size_t length, std::size_t offset) { - if (length > offset) { - std::memset(lhs + offset, ' ', length - offset); +void RTNAME(CharacterPad1)(char *lhs, std::size_t bytes, std::size_t offset) { + if (bytes > offset) { + 
std::memset(lhs + offset, ' ', bytes - offset); } } } diff --git a/flang/runtime/character.h b/flang/runtime/character.h index ff182dec54457a..6705d98bc8f041 100644 --- a/flang/runtime/character.h +++ b/flang/runtime/character.h @@ -11,11 +11,13 @@ #ifndef FORTRAN_RUNTIME_CHARACTER_H_ #define FORTRAN_RUNTIME_CHARACTER_H_ -#include "descriptor.h" #include "entry-names.h" #include namespace Fortran::runtime { + +class Descriptor; + extern "C" { // Appends the corresponding (or expanded) characters of 'operand' @@ -26,8 +28,8 @@ extern "C" { void RTNAME(CharacterConcatenate)(Descriptor &temp, const Descriptor &operand, const char *sourceFile = nullptr, int sourceLine = 0); -// Convenience specialization for character scalars. -void RTNAME(CharacterConcatenateScalar)( +// Convenience specialization for ASCII scalars. +void RTNAME(CharacterConcatenateScalar1)( Descriptor &temp, const char *, std::size_t byteLength); // Assigns the value(s) of 'rhs' to 'lhs'. Handles reallocation, @@ -38,16 +40,36 @@ void RTNAME(CharacterConcatenateScalar)( void RTNAME(CharacterAssign)(Descriptor &lhs, const Descriptor &rhs, const char *sourceFile = nullptr, int sourceLine = 0); -// Special-case support for optimized scalar CHARACTER concatenation -// expressions. +// CHARACTER comparisons. The kinds must match. Like std::memcmp(), +// the result is less than zero, zero, or greater than zero if the first +// argument is less than the second, equal to the second, or greater than +// the second, respectively. The shorter argument is treated as if it were +// padded on the right with blanks. +// N.B.: Calls to the restricted specific intrinsic functions LGE, LGT, LLE, +// & LLT are converted into calls to these during lowering; they don't have +// to be able to be passed as actual procedure arguments. +int RTNAME(CharacterCompareScalar)(const Descriptor &, const Descriptor &); +int RTNAME(CharacterCompareScalar1)( + const char *x, const char *y, std::size_t xBytes, std::size_t yBytes); +int RTNAME(CharacterCompareScalar2)(const char16_t *x, const char16_t *y, + std::size_t xBytes, std::size_t yBytes); +int RTNAME(CharacterCompareScalar4)(const char32_t *x, const char32_t *y, + std::size_t xBytes, std::size_t yBytes); + +// General CHARACTER comparison; the result is a LOGICAL(KIND=1) array that +// is established and populated. +void RTNAME(CharacterCompare)( + Descriptor &result, const Descriptor &, const Descriptor &); + +// Special-case support for optimized ASCII scalar expressions. // Copies data from 'rhs' to the remaining space (lhsLength - offset) // in 'lhs', if any. Returns the new offset. Assumes independence. -std::size_t RTNAME(CharacterAppend)(char *lhs, std::size_t lhsLength, - std::size_t offset, const char *rhs, std::size_t rhsLength); +std::size_t RTNAME(CharacterAppend1)(char *lhs, std::size_t lhsBytes, + std::size_t offset, const char *rhs, std::size_t rhsBytes); // Appends any necessary spaces to a CHARACTER(KIND=1) scalar. 
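The blank-padding rule documented for the comparison entry points declared just above boils down to a few lines. A self-contained sketch for KIND=1, which mirrors but is not the runtime's templated implementation:

```cpp
#include <algorithm>
#include <cstring>

// Sketch of the blank-padding rule: the shorter operand is compared as if
// extended on the right with blanks, so e.g. "ab " and "ab" compare equal.
int blankPaddedCompare(const char *x, std::size_t xLen, const char *y,
                       std::size_t yLen) {
  std::size_t minLen = std::min(xLen, yLen);
  if (int cmp = std::memcmp(x, y, minLen))
    return cmp < 0 ? -1 : 1;
  // Compare the longer operand's tail against blanks.
  const char *tail = xLen > yLen ? x + minLen : y + minLen;
  std::size_t tailLen = std::max(xLen, yLen) - minLen;
  int sign = xLen > yLen ? 1 : -1; // whose tail is being examined
  for (std::size_t i = 0; i < tailLen; ++i) {
    unsigned char c = static_cast<unsigned char>(tail[i]);
    if (c != ' ')
      return c > ' ' ? sign : -sign;
  }
  return 0;
}
```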
-void RTNAME(CharacterPad)(char *lhs, std::size_t length, std::size_t offset); +void RTNAME(CharacterPad1)(char *lhs, std::size_t bytes, std::size_t offset); } } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_CHARACTER_H_ diff --git a/flang/unittests/Runtime/CMakeLists.txt b/flang/unittests/Runtime/CMakeLists.txt index a5297ac67821f6..4d6ac6411fe2bd 100644 --- a/flang/unittests/Runtime/CMakeLists.txt +++ b/flang/unittests/Runtime/CMakeLists.txt @@ -5,15 +5,15 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) add_library(RuntimeTesting testing.cpp - ) +) add_executable(format-test format.cpp ) target_link_libraries(format-test - FortranRuntime RuntimeTesting + FortranRuntime LLVMSupport ) @@ -24,8 +24,8 @@ add_executable(hello-world ) target_link_libraries(hello-world - FortranRuntime RuntimeTesting + FortranRuntime LLVMSupport ) @@ -42,12 +42,24 @@ target_link_libraries(external-hello-world add_executable(list-input-test list-input.cpp - ) +) target_link_libraries(list-input-test - FortranRuntime RuntimeTesting + FortranRuntime LLVMSupport ) add_test(NAME ListInput COMMAND list-input-test) + +add_executable(character-test + character.cpp +) + +target_link_libraries(character-test + RuntimeTesting + FortranRuntime + LLVMSupport +) + +add_test(NAME CharacterTest COMMAND character-test) diff --git a/flang/unittests/Runtime/character.cpp b/flang/unittests/Runtime/character.cpp new file mode 100644 index 00000000000000..fb023473f64aaa --- /dev/null +++ b/flang/unittests/Runtime/character.cpp @@ -0,0 +1,59 @@ +// Basic sanity tests of CHARACTER API; exhaustive testing will be done +// in Fortran. + +#include "../../runtime/character.h" +#include "testing.h" +#include + +using namespace Fortran::runtime; + +static void AppendAndPad(std::size_t limit) { + char x[8]; + std::size_t xLen{0}; + std::memset(x, 0, sizeof x); + xLen = RTNAME(CharacterAppend1)(x, limit, xLen, "abc", 3); + xLen = RTNAME(CharacterAppend1)(x, limit, xLen, "DE", 2); + RTNAME(CharacterPad1)(x, limit, xLen); + if (xLen > limit) { + Fail() << "xLen " << xLen << ">" << limit << '\n'; + } + if (x[limit]) { + Fail() << "x[" << limit << "]='" << x[limit] << "'\n"; + x[limit] = '\0'; + } + if (std::memcmp(x, "abcDE ", limit)) { + Fail() << "x = '" << x << "'\n"; + } +} + +static void TestCharCompare(const char *x, const char *y, std::size_t xBytes, + std::size_t yBytes, int expect) { + int cmp{RTNAME(CharacterCompareScalar1)(x, y, xBytes, yBytes)}; + if (cmp != expect) { + char buf[2][8]; + std::memset(buf, 0, sizeof buf); + std::memcpy(buf[0], x, xBytes); + std::memcpy(buf[1], y, yBytes); + Fail() << "compare '" << buf[0] << "'(" << xBytes << ") to '" << buf[1] + << "'(" << yBytes << "), got " << cmp << ", should be " << expect + << '\n'; + } +} + +static void Compare(const char *x, const char *y, std::size_t xBytes, + std::size_t yBytes, int expect) { + TestCharCompare(x, y, xBytes, yBytes, expect); + TestCharCompare(y, x, yBytes, xBytes, -expect); +} + +int main() { + StartTests(); + for (std::size_t j{0}; j < 8; ++j) { + AppendAndPad(j); + } + Compare("abc", "abc", 3, 3, 0); + Compare("abc", "def", 3, 3, -1); + Compare("ab ", "abc", 3, 2, 0); + Compare("abc", "abc", 2, 3, -1); + return EndTests(); +} diff --git a/flang/unittests/Runtime/format.cpp b/flang/unittests/Runtime/format.cpp index c855523b427e15..87989eacebcbec 100644 --- a/flang/unittests/Runtime/format.cpp +++ b/flang/unittests/Runtime/format.cpp @@ -3,7 +3,6 @@ #include "testing.h" #include "../runtime/format-implementation.h" #include 
"../runtime/io-error.h" -#include "llvm/Support/raw_ostream.h" #include #include #include diff --git a/flang/unittests/Runtime/hello.cpp b/flang/unittests/Runtime/hello.cpp index 64ed2cbaba721a..22e7380128f358 100644 --- a/flang/unittests/Runtime/hello.cpp +++ b/flang/unittests/Runtime/hello.cpp @@ -3,7 +3,6 @@ #include "testing.h" #include "../../runtime/descriptor.h" #include "../../runtime/io-api.h" -#include "llvm/Support/raw_ostream.h" #include using namespace Fortran::runtime; diff --git a/flang/unittests/Runtime/list-input.cpp b/flang/unittests/Runtime/list-input.cpp index 9f6377656f9133..c7a660dc87aae1 100644 --- a/flang/unittests/Runtime/list-input.cpp +++ b/flang/unittests/Runtime/list-input.cpp @@ -4,7 +4,6 @@ #include "../../runtime/descriptor.h" #include "../../runtime/io-api.h" #include "../../runtime/io-error.h" -#include "llvm/Support/raw_ostream.h" #include #include diff --git a/flang/unittests/Runtime/testing.cpp b/flang/unittests/Runtime/testing.cpp index 8a31f23e9ef56b..146b37db9a5774 100644 --- a/flang/unittests/Runtime/testing.cpp +++ b/flang/unittests/Runtime/testing.cpp @@ -1,6 +1,5 @@ #include "testing.h" #include "../../runtime/terminator.h" -#include "llvm/Support/raw_ostream.h" #include #include #include diff --git a/flang/unittests/Runtime/testing.h b/flang/unittests/Runtime/testing.h index 943b6fd8d915e8..1b401aaf854336 100644 --- a/flang/unittests/Runtime/testing.h +++ b/flang/unittests/Runtime/testing.h @@ -1,8 +1,8 @@ #ifndef FORTRAN_TEST_RUNTIME_TESTING_H_ #define FORTRAN_TEST_RUNTIME_TESTING_H_ +#include "llvm/Support/raw_ostream.h" #include -#include namespace llvm { class raw_ostream; diff --git a/libcxx/utils/libcxx/test/newformat.py b/libcxx/utils/libcxx/test/newformat.py index d9d2a5073b408b..2ee6e502da6d68 100644 --- a/libcxx/utils/libcxx/test/newformat.py +++ b/libcxx/utils/libcxx/test/newformat.py @@ -13,6 +13,97 @@ import re import subprocess +def _supportsVerify(test): + """ + Determine whether clang-verify is supported for that test. + + This is done by checking whether the %{cxx} substitution supports certain + compiler flags. + """ + command = "%{{cxx}} -xc++ {} -Werror -fsyntax-only -Xclang -verify-ignore-unexpected".format(os.devnull) + command = lit.TestRunner.applySubstitutions([command], test.config.substitutions, + recursion_limit=test.config.recursiveExpansionLimit)[0] + devNull = open(os.devnull, 'w') + result = subprocess.call(command, shell=True, stdout=devNull, stderr=devNull) + return result == 0 + +def parseScript(test, preamble, fileDependencies): + """ + Extract the script from a test, with substitutions applied. + + Returns a list of commands ready to be executed. + + - test + The lit.Test to parse. + + - preamble + A list of commands to perform before any command in the test. + These commands can contain unexpanded substitutions, but they + must not be of the form 'RUN:' -- they must be proper commands + once substituted. + + - fileDependencies + A list of additional file dependencies for the test. 
+ """ + + # Get the default substitutions + tmpDir, tmpBase = lit.TestRunner.getTempPaths(test) + useExternalSh = True + substitutions = lit.TestRunner.getDefaultSubstitutions(test, tmpDir, tmpBase, + normalize_slashes=useExternalSh) + + # Add the %{build} and %{run} convenience substitutions + substitutions.append(('%{build}', '%{cxx} %s %{flags} %{compile_flags} %{link_flags} -o %t.exe')) + substitutions.append(('%{run}', '%{exec} %t.exe')) + + # Add the %{verify} substitution and the verify-support feature if Clang-verify is supported + if _supportsVerify(test): + test.config.available_features.add('verify-support') + substitutions.append(('%{verify}', '-Xclang -verify -Xclang -verify-ignore-unexpected=note -ferror-limit=0')) + + # Parse the test file, including custom directives + additionalCompileFlags = [] + fileDependencies = list(fileDependencies) + parsers = [ + lit.TestRunner.IntegratedTestKeywordParser('FILE_DEPENDENCIES:', + lit.TestRunner.ParserKind.LIST, + initial_value=fileDependencies), + lit.TestRunner.IntegratedTestKeywordParser('ADDITIONAL_COMPILE_FLAGS:', + lit.TestRunner.ParserKind.LIST, + initial_value=additionalCompileFlags) + ] + + script = list(preamble) + parsed = lit.TestRunner.parseIntegratedTestScript(test, additional_parsers=parsers, + require_script=not script) + if isinstance(parsed, lit.Test.Result): + return parsed + script += parsed + + # Add compile flags specified with ADDITIONAL_COMPILE_FLAGS. + substitutions = [(s, x + ' ' + ' '.join(additionalCompileFlags)) if s == '%{compile_flags}' + else (s, x) for (s, x) in substitutions] + + # Perform substitutions inside FILE_DEPENDENCIES lines (or injected dependencies). + # This allows using variables like %t in file dependencies. Also note that we really + # need to resolve %{file_dependencies} now, because otherwise we won't be able to + # make all paths absolute below. + fileDependencies = lit.TestRunner.applySubstitutions(fileDependencies, substitutions, + recursion_limit=test.config.recursiveExpansionLimit) + + # Add the %{file_dependencies} substitution before we perform substitutions + # inside the script. + testDir = os.path.dirname(test.getSourcePath()) + fileDependencies = [f if os.path.isabs(f) else os.path.join(testDir, f) for f in fileDependencies] + substitutions.append(('%{file_dependencies}', ' '.join(map(pipes.quote, fileDependencies)))) + + # Perform substitutions in the script itself. + script = lit.TestRunner.applySubstitutions(script, substitutions, + recursion_limit=test.config.recursiveExpansionLimit) + + return script + + class CxxStandardLibraryTest(lit.formats.TestFormat): """ Lit test format for the C++ Standard Library conformance test suite. @@ -148,15 +239,6 @@ def _checkBaseSubstitutions(self, substitutions): for s in ['%{cxx}', '%{compile_flags}', '%{link_flags}', '%{flags}', '%{exec}']: assert s in substitutions, "Required substitution {} was not provided".format(s) - # Determine whether clang-verify is supported. 
- def _supportsVerify(self, test): - command = "%{{cxx}} -xc++ {} -Werror -fsyntax-only -Xclang -verify-ignore-unexpected".format(os.devnull) - command = lit.TestRunner.applySubstitutions([command], test.config.substitutions, - recursion_limit=test.config.recursiveExpansionLimit)[0] - devNull = open(os.devnull, 'w') - result = subprocess.call(command, shell=True, stdout=devNull, stderr=devNull) - return result == 0 - def _disableWithModules(self, test): with open(test.getSourcePath(), 'rb') as f: contents = f.read() @@ -225,7 +307,7 @@ def execute(self, test, litConfig): # otherwise it's like a .compile.fail.cpp test. This is only provided # for backwards compatibility with the test suite. elif filename.endswith('.fail.cpp'): - if self._supportsVerify(test): + if _supportsVerify(test): steps = [ "%dbg(COMPILED WITH) %{cxx} %s %{flags} %{compile_flags} -fsyntax-only %{verify}" ] @@ -242,87 +324,11 @@ def addCompileFlags(self, config, *flags): string = ' '.join(flags) config.substitutions = [(s, x + ' ' + string) if s == '%{compile_flags}' else (s, x) for (s, x) in config.substitutions] - def _parseScript(self, test, preamble, fileDependencies): - """ - Extract the script from a test, with substitutions applied. - - Returns a list of commands ready to be executed. - - - test - The lit.Test to parse. - - - preamble - A list of commands to perform before any command in the test. - These commands can contain unexpanded substitutions, but they - must not be of the form 'RUN:' -- they must be proper commands - once substituted. - - - fileDependencies - A list of additional file dependencies for the test. - """ - - # Get the default substitutions - tmpDir, tmpBase = lit.TestRunner.getTempPaths(test) - useExternalSh = True - substitutions = lit.TestRunner.getDefaultSubstitutions(test, tmpDir, tmpBase, - normalize_slashes=useExternalSh) - - # Add the %{build} and %{run} convenience substitutions - substitutions.append(('%{build}', '%{cxx} %s %{flags} %{compile_flags} %{link_flags} -o %t.exe')) - substitutions.append(('%{run}', '%{exec} %t.exe')) - - # Add the %{verify} substitution and the verify-support feature if Clang-verify is supported - if self._supportsVerify(test): - test.config.available_features.add('verify-support') - substitutions.append(('%{verify}', '-Xclang -verify -Xclang -verify-ignore-unexpected=note -ferror-limit=0')) - - # Parse the test file, including custom directives - additionalCompileFlags = [] - fileDependencies = list(fileDependencies) - parsers = [ - lit.TestRunner.IntegratedTestKeywordParser('FILE_DEPENDENCIES:', - lit.TestRunner.ParserKind.LIST, - initial_value=fileDependencies), - lit.TestRunner.IntegratedTestKeywordParser('ADDITIONAL_COMPILE_FLAGS:', - lit.TestRunner.ParserKind.LIST, - initial_value=additionalCompileFlags) - ] - - script = list(preamble) - parsed = lit.TestRunner.parseIntegratedTestScript(test, additional_parsers=parsers, - require_script=not script) - if isinstance(parsed, lit.Test.Result): - return parsed - script += parsed - - # Add compile flags specified with ADDITIONAL_COMPILE_FLAGS. - substitutions = [(s, x + ' ' + ' '.join(additionalCompileFlags)) if s == '%{compile_flags}' - else (s, x) for (s, x) in substitutions] - - # Perform substitutions inside FILE_DEPENDENCIES lines (or injected dependencies). - # This allows using variables like %t in file dependencies. Also note that we really - # need to resolve %{file_dependencies} now, because otherwise we won't be able to - # make all paths absolute below. 
- fileDependencies = lit.TestRunner.applySubstitutions(fileDependencies, substitutions, - recursion_limit=test.config.recursiveExpansionLimit) - - # Add the %{file_dependencies} substitution before we perform substitutions - # inside the script. - testDir = os.path.dirname(test.getSourcePath()) - fileDependencies = [f if os.path.isabs(f) else os.path.join(testDir, f) for f in fileDependencies] - substitutions.append(('%{file_dependencies}', ' '.join(map(pipes.quote, fileDependencies)))) - - # Perform substitutions in the script itself. - script = lit.TestRunner.applySubstitutions(script, substitutions, - recursion_limit=test.config.recursiveExpansionLimit) - - return script - def _executeShTest(self, test, litConfig, steps, fileDependencies=None): if test.config.unsupported: return lit.Test.Result(lit.Test.UNSUPPORTED, 'Test is unsupported') - script = self._parseScript(test, steps, fileDependencies or []) + script = parseScript(test, steps, fileDependencies or []) if isinstance(script, lit.Test.Result): return script diff --git a/lldb/include/lldb/Utility/XcodeSDK.h b/lldb/include/lldb/Utility/XcodeSDK.h index 552c51c368443b..24ab5b1fdf7aef 100644 --- a/lldb/include/lldb/Utility/XcodeSDK.h +++ b/lldb/include/lldb/Utility/XcodeSDK.h @@ -22,6 +22,9 @@ class XcodeSDK { public: XcodeSDK() = default; + /// Initialize an XcodeSDK object with an SDK name. The SDK name is the last + /// directory component of a path one would pass to clang's -isysroot + /// parameter. For example, "MacOSX.10.14.sdk". XcodeSDK(std::string &&name) : m_name(std::move(name)) {} static XcodeSDK GetAnyMacOS() { return XcodeSDK("MacOSX.sdk"); } @@ -38,7 +41,6 @@ class XcodeSDK { numSDKTypes, unknown = -1 }; - static llvm::StringRef GetNameForType(Type type); /// The merge function follows a strict order to maintain monotonicity: /// 1. SDK with the higher SDKType wins. @@ -49,15 +51,27 @@ class XcodeSDK { XcodeSDK(const XcodeSDK&) = default; bool operator==(XcodeSDK other); - /// Return parsed SDK number, and SDK version number. - std::tuple Parse() const; + /// A parsed SDK directory name. + struct Info { + Type type = unknown; + llvm::VersionTuple version; + bool internal = false; + + Info() = default; + bool operator<(const Info &other) const; + }; + + /// Return parsed SDK type and version number. + Info Parse() const; + bool IsAppleInternalSDK() const; llvm::VersionTuple GetVersion() const; Type GetType() const; llvm::StringRef GetString() const; static bool SDKSupportsModules(Type type, llvm::VersionTuple version); static bool SDKSupportsModules(Type desired_type, const FileSpec &sdk_path); - static llvm::StringRef GetSDKNameForType(Type type); + /// Return the canonical SDK name, such as "macosx" for the macOS SDK. 
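A hypothetical round trip through the Info-based API declared here; the expected values follow the unit tests added later in this patch:

```cpp
#include "lldb/Utility/XcodeSDK.h"
#include <string>
#include <utility>

// Hypothetical usage only; the expected strings match the XcodeSDK unit
// tests added elsewhere in this patch.
std::string canonicalNameFor(std::string dirName) {
  using lldb_private::XcodeSDK;
  XcodeSDK sdk(std::move(dirName));        // e.g. "MacOSX10.15.Internal.sdk"
  XcodeSDK::Info info = sdk.Parse();       // type=MacOSX, version=10.15, internal=true
  return XcodeSDK::GetCanonicalName(info); // "macosx10.15.internal"
}
```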
+ static std::string GetCanonicalName(Info info); }; } // namespace lldb_private diff --git a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm index c09339e8c67315..e495c752cb193c 100644 --- a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm +++ b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm @@ -298,37 +298,66 @@ static void ParseOSVersion(llvm::VersionTuple &version, NSString *Key) { } std::string HostInfoMacOSX::GetXcodeSDK(XcodeSDK sdk) { - std::string xcrun_cmd = "xcrun --show-sdk-path --sdk " + - XcodeSDK::GetSDKNameForType(sdk.GetType()).str(); - llvm::VersionTuple version = sdk.GetVersion(); - if (!version.empty()) - xcrun_cmd += version.getAsString(); - - int status = 0; - int signo = 0; - std::string output_str; - lldb_private::Status error = - Host::RunShellCommand(xcrun_cmd.c_str(), FileSpec(), &status, &signo, - &output_str, std::chrono::seconds(15)); - - // Check that xcrun return something useful. - if (status != 0 || output_str.empty()) - return {}; - - // Convert to a StringRef so we can manipulate the string without modifying - // the underlying data. - llvm::StringRef output(output_str); - - // Remove any trailing newline characters. - output = output.rtrim(); + XcodeSDK::Info info = sdk.Parse(); + std::string sdk_name = XcodeSDK::GetCanonicalName(info); + auto find_sdk = [](std::string sdk_name) -> std::string { + std::string xcrun_cmd = "xcrun --show-sdk-path --sdk " + sdk_name; + int status = 0; + int signo = 0; + std::string output_str; + lldb_private::Status error = + Host::RunShellCommand(xcrun_cmd.c_str(), FileSpec(), &status, &signo, + &output_str, std::chrono::seconds(15)); + + // Check that xcrun return something useful. + if (status != 0 || output_str.empty()) + return {}; + + // Convert to a StringRef so we can manipulate the string without modifying + // the underlying data. + llvm::StringRef output(output_str); + + // Remove any trailing newline characters. + output = output.rtrim(); + + // Strip any leading newline characters and everything before them. + const size_t last_newline = output.rfind('\n'); + if (last_newline != llvm::StringRef::npos) + output = output.substr(last_newline + 1); + + return output.str(); + }; + + std::string path = find_sdk(sdk_name); + while (path.empty()) { + // Try an alternate spelling of the name ("macosx10.9internal"). + if (info.type == XcodeSDK::Type::MacOSX && !info.version.empty() && + info.internal) { + llvm::StringRef fixed(sdk_name); + if (fixed.consume_back(".internal")) + sdk_name = fixed.str() + "internal"; + path = find_sdk(sdk_name); + if (!path.empty()) + break; + } + Log *log = lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_HOST); + LLDB_LOGF(log, "Couldn't find SDK %s on host", sdk_name.c_str()); + + // Try without the version. + if (!info.version.empty()) { + info.version = {}; + sdk_name = XcodeSDK::GetCanonicalName(info); + path = find_sdk(sdk_name); + if (!path.empty()) + break; + } - // Strip any leading newline characters and everything before them. - const size_t last_newline = output.rfind('\n'); - if (last_newline != llvm::StringRef::npos) - output = output.substr(last_newline + 1); + LLDB_LOGF(log, "Couldn't find any matching SDK on host"); + return {}; + } // Whatever is left in output should be a valid path. 
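For reference, for an internal SDK directory such as "MacOSX10.9.Internal.sdk" the fallback loop above asks xcrun for the following `-sdk` spellings in order, stopping at the first one that resolves (illustrative list only):

```cpp
#include <string>
#include <vector>

// Illustrative only: the -sdk spellings the loop above would try for
// "MacOSX10.9.Internal.sdk", in order.
std::vector<std::string> candidateSdkNames() {
  return {
      "macosx10.9.internal", // canonical name from XcodeSDK::GetCanonicalName
      "macosx10.9internal",  // alternate spelling tried for internal SDKs
      "macosx.internal",     // canonical name again, with the version dropped
  };
}
```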
- if (!FileSystem::Instance().Exists(output)) + if (!FileSystem::Instance().Exists(path)) return {}; - return output.str(); + return path; } diff --git a/lldb/source/Utility/XcodeSDK.cpp b/lldb/source/Utility/XcodeSDK.cpp index 7ad0090f85e2d8..a34eac6b2c95b0 100644 --- a/lldb/source/Utility/XcodeSDK.cpp +++ b/lldb/source/Utility/XcodeSDK.cpp @@ -64,13 +64,24 @@ static llvm::VersionTuple ParseSDKVersion(llvm::StringRef &name) { return version; } +static bool ParseAppleInternalSDK(llvm::StringRef &name) { + return name.consume_front("Internal."); +} + +XcodeSDK::Info XcodeSDK::Parse() const { + XcodeSDK::Info info; + llvm::StringRef input(m_name); + info.type = ParseSDKName(input); + info.version = ParseSDKVersion(input); + info.internal = ParseAppleInternalSDK(input); + return info; +} -std::tuple XcodeSDK::Parse() const { +bool XcodeSDK::IsAppleInternalSDK() const { llvm::StringRef input(m_name); - XcodeSDK::Type sdk = ParseSDKName(input); - llvm::VersionTuple version = ParseSDKVersion(input); - return std::make_tuple( - std::move(sdk), std::move(version)); + ParseSDKName(input); + ParseSDKVersion(input); + return ParseAppleInternalSDK(input); } llvm::VersionTuple XcodeSDK::GetVersion() const { @@ -86,37 +97,64 @@ XcodeSDK::Type XcodeSDK::GetType() const { llvm::StringRef XcodeSDK::GetString() const { return m_name; } +bool XcodeSDK::Info::operator<(const Info &other) const { + return std::tie(type, version, internal) < + std::tie(other.type, other.version, other.internal); +} void XcodeSDK::Merge(XcodeSDK other) { // The "bigger" SDK always wins. - if (Parse() < other.Parse()) + auto l = Parse(); + auto r = other.Parse(); + if (l < r) *this = other; + else { + // The Internal flag always wins. + if (llvm::StringRef(m_name).endswith(".sdk")) + if (!l.internal && r.internal) + m_name = + m_name.substr(0, m_name.size() - 3) + std::string("Internal.sdk"); + } } -llvm::StringRef XcodeSDK::GetSDKNameForType(XcodeSDK::Type type) { - switch (type) { +std::string XcodeSDK::GetCanonicalName(XcodeSDK::Info info) { + std::string name; + switch (info.type) { case MacOSX: - return "macosx"; + name = "macosx"; + break; case iPhoneSimulator: - return "iphonesimulator"; + name = "iphonesimulator"; + break; case iPhoneOS: - return "iphoneos"; + name = "iphoneos"; + break; case AppleTVSimulator: - return "appletvsimulator"; + name = "appletvsimulator"; + break; case AppleTVOS: - return "appletvos"; + name = "appletvos"; + break; case WatchSimulator: - return "watchsimulator"; + name = "watchsimulator"; + break; case watchOS: - return "watchos"; + name = "watchos"; + break; case bridgeOS: - return "bridgeos"; + name = "bridgeos"; + break; case Linux: - return "linux"; + name = "linux"; + break; case numSDKTypes: case unknown: - return ""; + return {}; } - llvm_unreachable("unhandled switch case"); + if (!info.version.empty()) + name += info.version.getAsString(); + if (info.internal) + name += ".internal"; + return name; } bool XcodeSDK::SDKSupportsModules(XcodeSDK::Type sdk_type, @@ -147,12 +185,15 @@ bool XcodeSDK::SDKSupportsModules(XcodeSDK::Type desired_type, const llvm::StringRef sdk_name = last_path_component.GetStringRef(); const std::string sdk_name_lower = sdk_name.lower(); - const llvm::StringRef sdk_string = GetSDKNameForType(desired_type); + Info info; + info.type = desired_type; + const llvm::StringRef sdk_string = GetCanonicalName(info); if (!llvm::StringRef(sdk_name_lower).startswith(sdk_string)) return false; auto version_part = sdk_name.drop_front(sdk_string.size()); 
version_part.consume_back(".sdk"); + version_part.consume_back(".Internal"); llvm::VersionTuple version; if (version.tryParse(version_part)) diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index 670361787f1feb..56f181597b1825 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -592,57 +592,54 @@ int Driver::MainLoop() { bool quit_requested = false; bool stopped_for_crash = false; if ((commands_data != nullptr) && (commands_size != 0u)) { - bool success = true; FILE *commands_file = PrepareCommandsForSourcing(commands_data, commands_size); - if (commands_file != nullptr) { - m_debugger.SetInputFileHandle(commands_file, true); - - // Set the debugger into Sync mode when running the command file. - // Otherwise command files - // that run the target won't run in a sensible way. - bool old_async = m_debugger.GetAsync(); - m_debugger.SetAsync(false); - int num_errors = 0; - - SBCommandInterpreterRunOptions options; - options.SetStopOnError(true); - if (m_option_data.m_batch) - options.SetStopOnCrash(true); - - m_debugger.RunCommandInterpreter(handle_events, spawn_thread, options, - num_errors, quit_requested, - stopped_for_crash); - - if (m_option_data.m_batch && stopped_for_crash && - !m_option_data.m_after_crash_commands.empty()) { - SBStream crash_commands_stream; - WriteCommandsForSourcing(eCommandPlacementAfterCrash, - crash_commands_stream); - const char *crash_commands_data = crash_commands_stream.GetData(); - const size_t crash_commands_size = crash_commands_stream.GetSize(); - commands_file = PrepareCommandsForSourcing(crash_commands_data, - crash_commands_size); - if (commands_file != nullptr) { - bool local_quit_requested; - bool local_stopped_for_crash; - m_debugger.SetInputFileHandle(commands_file, true); - - m_debugger.RunCommandInterpreter(handle_events, spawn_thread, options, - num_errors, local_quit_requested, - local_stopped_for_crash); - if (local_quit_requested) - quit_requested = true; - } - } - m_debugger.SetAsync(old_async); - } else - success = false; - // Something went wrong with command pipe - if (!success) { + if (commands_file == nullptr) { + // We should have already printed an error in PrepareCommandsForSourcing. exit(1); } + + m_debugger.SetInputFileHandle(commands_file, true); + + // Set the debugger into Sync mode when running the command file. + // Otherwise command files + // that run the target won't run in a sensible way. 
+ bool old_async = m_debugger.GetAsync(); + m_debugger.SetAsync(false); + int num_errors = 0; + + SBCommandInterpreterRunOptions options; + options.SetStopOnError(true); + if (m_option_data.m_batch) + options.SetStopOnCrash(true); + + m_debugger.RunCommandInterpreter(handle_events, spawn_thread, options, + num_errors, quit_requested, + stopped_for_crash); + + if (m_option_data.m_batch && stopped_for_crash && + !m_option_data.m_after_crash_commands.empty()) { + SBStream crash_commands_stream; + WriteCommandsForSourcing(eCommandPlacementAfterCrash, + crash_commands_stream); + const char *crash_commands_data = crash_commands_stream.GetData(); + const size_t crash_commands_size = crash_commands_stream.GetSize(); + commands_file = + PrepareCommandsForSourcing(crash_commands_data, crash_commands_size); + if (commands_file != nullptr) { + bool local_quit_requested; + bool local_stopped_for_crash; + m_debugger.SetInputFileHandle(commands_file, true); + + m_debugger.RunCommandInterpreter(handle_events, spawn_thread, options, + num_errors, local_quit_requested, + local_stopped_for_crash); + if (local_quit_requested) + quit_requested = true; + } + } + m_debugger.SetAsync(old_async); } // Now set the input file handle to STDIN and run the command diff --git a/lldb/unittests/Host/HostInfoTest.cpp b/lldb/unittests/Host/HostInfoTest.cpp index ed4b7b5d39c001..d854426e489853 100644 --- a/lldb/unittests/Host/HostInfoTest.cpp +++ b/lldb/unittests/Host/HostInfoTest.cpp @@ -50,3 +50,13 @@ TEST_F(HostInfoTest, GetHostname) { std::string s("abc"); EXPECT_TRUE(HostInfo::GetHostname(s)); } + +#if defined(__APPLE__) +TEST_F(HostInfoTest, GetXcodeSDK) { + EXPECT_FALSE(HostInfo::GetXcodeSDK(XcodeSDK("MacOSX.sdk")).empty()); + // These are expected to fall back to an available version. + EXPECT_FALSE(HostInfo::GetXcodeSDK(XcodeSDK("MacOSX9999.sdk")).empty()); + // This is expected to fail. 
+ EXPECT_TRUE(HostInfo::GetXcodeSDK(XcodeSDK("CeciNestPasUnOS.sdk")).empty()); +} +#endif diff --git a/lldb/unittests/Utility/XcodeSDKTest.cpp b/lldb/unittests/Utility/XcodeSDKTest.cpp index a316516a167588..95b909e700184d 100644 --- a/lldb/unittests/Utility/XcodeSDKTest.cpp +++ b/lldb/unittests/Utility/XcodeSDKTest.cpp @@ -30,6 +30,11 @@ TEST(XcodeSDKTest, ParseTest) { EXPECT_EQ(XcodeSDK("MacOSX.sdk").GetVersion(), llvm::VersionTuple()); EXPECT_EQ(XcodeSDK("MacOSX10.9.sdk").GetVersion(), llvm::VersionTuple(10, 9)); EXPECT_EQ(XcodeSDK("MacOSX10.15.4.sdk").GetVersion(), llvm::VersionTuple(10, 15)); + EXPECT_EQ(XcodeSDK("MacOSX.sdk").IsAppleInternalSDK(), false); + EXPECT_EQ(XcodeSDK("MacOSX10.15.Internal.sdk").GetType(), XcodeSDK::MacOSX); + EXPECT_EQ(XcodeSDK("MacOSX10.15.Internal.sdk").GetVersion(), + llvm::VersionTuple(10, 15)); + EXPECT_EQ(XcodeSDK("MacOSX10.15.Internal.sdk").IsAppleInternalSDK(), true); EXPECT_EQ(XcodeSDK().GetType(), XcodeSDK::unknown); EXPECT_EQ(XcodeSDK().GetVersion(), llvm::VersionTuple()); } @@ -46,6 +51,12 @@ TEST(XcodeSDKTest, MergeTest) { EXPECT_EQ(sdk.GetVersion(), llvm::VersionTuple(1, 1)); sdk.Merge(XcodeSDK("WatchOS2.0.sdk")); EXPECT_EQ(sdk.GetVersion(), llvm::VersionTuple(2, 0)); + sdk.Merge(XcodeSDK("WatchOS1.1.Internal.sdk")); + EXPECT_EQ(sdk.GetVersion(), llvm::VersionTuple(2, 0)); + EXPECT_EQ(sdk.IsAppleInternalSDK(), true); + XcodeSDK empty; + empty.Merge(XcodeSDK("MacOSX10.14.Internal.sdk")); + EXPECT_EQ(empty.GetString(), llvm::StringRef("MacOSX10.14.Internal.sdk")); } TEST(XcodeSDKTest, SDKSupportsModules) { @@ -55,6 +66,10 @@ TEST(XcodeSDKTest, SDKSupportsModules) { FileSpec( base + "iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator12.0.sdk"))); + EXPECT_TRUE(XcodeSDK::SDKSupportsModules( + XcodeSDK::Type::iPhoneSimulator, + FileSpec(base + "iPhoneSimulator.platform/Developer/SDKs/" + "iPhoneSimulator12.0.Internal.sdk"))); EXPECT_FALSE(XcodeSDK::SDKSupportsModules( XcodeSDK::Type::iPhoneSimulator, FileSpec( @@ -68,19 +83,65 @@ TEST(XcodeSDKTest, SDKSupportsModules) { FileSpec(base + "MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk"))); } -TEST(XcodeSDKTest, GetSDKNameForType) { - EXPECT_EQ("macosx", XcodeSDK::GetSDKNameForType(XcodeSDK::Type::MacOSX)); - EXPECT_EQ("iphonesimulator", - XcodeSDK::GetSDKNameForType(XcodeSDK::Type::iPhoneSimulator)); - EXPECT_EQ("iphoneos", XcodeSDK::GetSDKNameForType(XcodeSDK::Type::iPhoneOS)); - EXPECT_EQ("appletvsimulator", - XcodeSDK::GetSDKNameForType(XcodeSDK::Type::AppleTVSimulator)); - EXPECT_EQ("appletvos", - XcodeSDK::GetSDKNameForType(XcodeSDK::Type::AppleTVOS)); - EXPECT_EQ("watchsimulator", - XcodeSDK::GetSDKNameForType(XcodeSDK::Type::WatchSimulator)); - EXPECT_EQ("watchos", XcodeSDK::GetSDKNameForType(XcodeSDK::Type::watchOS)); - EXPECT_EQ("linux", XcodeSDK::GetSDKNameForType(XcodeSDK::Type::Linux)); - EXPECT_EQ("", XcodeSDK::GetSDKNameForType(XcodeSDK::Type::numSDKTypes)); - EXPECT_EQ("", XcodeSDK::GetSDKNameForType(XcodeSDK::Type::unknown)); +TEST(XcodeSDKTest, GetCanonicalName) { + XcodeSDK::Info info; + info.type = XcodeSDK::Type::MacOSX; + EXPECT_EQ("macosx", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::iPhoneSimulator; + EXPECT_EQ("iphonesimulator", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::iPhoneOS; + EXPECT_EQ("iphoneos", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::AppleTVSimulator; + EXPECT_EQ("appletvsimulator", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::AppleTVOS; + EXPECT_EQ("appletvos", 
XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::WatchSimulator; + EXPECT_EQ("watchsimulator", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::watchOS; + EXPECT_EQ("watchos", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::Linux; + EXPECT_EQ("linux", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::numSDKTypes; + EXPECT_EQ("", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::unknown; + EXPECT_EQ("", XcodeSDK::GetCanonicalName(info)); + + info.internal = true; + info.type = XcodeSDK::Type::MacOSX; + EXPECT_EQ("macosx.internal", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::iPhoneSimulator; + EXPECT_EQ("iphonesimulator.internal", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::iPhoneOS; + EXPECT_EQ("iphoneos.internal", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::AppleTVSimulator; + EXPECT_EQ("appletvsimulator.internal", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::AppleTVOS; + EXPECT_EQ("appletvos.internal", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::WatchSimulator; + EXPECT_EQ("watchsimulator.internal", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::watchOS; + EXPECT_EQ("watchos.internal", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::MacOSX; + info.version = llvm::VersionTuple(10, 9); + EXPECT_EQ("macosx10.9.internal", XcodeSDK::GetCanonicalName(info)); + + info.type = XcodeSDK::Type::iPhoneOS; + info.version = llvm::VersionTuple(7, 0); + EXPECT_EQ("iphoneos7.0.internal", XcodeSDK::GetCanonicalName(info)); } diff --git a/llvm/include/llvm/ADT/SmallBitVector.h b/llvm/include/llvm/ADT/SmallBitVector.h index 14545e2b612d78..f570bac23ad517 100644 --- a/llvm/include/llvm/ADT/SmallBitVector.h +++ b/llvm/include/llvm/ADT/SmallBitVector.h @@ -287,11 +287,11 @@ class SmallBitVector { /// Returns -1 if the next unset bit is not found. int find_next_unset(unsigned Prev) const { if (isSmall()) { - ++Prev; uintptr_t Bits = getSmallBits(); // Mask in previous bits. - uintptr_t Mask = (uintptr_t(1) << Prev) - 1; - Bits |= Mask; + Bits |= (uintptr_t(1) << (Prev + 1)) - 1; + // Mask in unused bits. + Bits |= ~uintptr_t(0) << getSmallSize(); if (Bits == ~uintptr_t(0) || Prev + 1 >= getSmallSize()) return -1; diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index b4aa47981adabc..ff9ec9cfa6b84b 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -280,7 +280,7 @@ class AsmPrinter : public MachineFunctionPass { const class Function *Fn; uint8_t Version; - void emit(int, MCStreamer *, const MCExpr *, const MCSymbol *) const; + void emit(int, MCStreamer *) const; }; // All the sleds to be emitted. diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 24a82c431a5f2a..21e3d93ded2298 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -725,7 +725,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // If the cast is marked as legal (or promote) then assume low cost. if (SrcLT.first == DstLT.first && TLI->isOperationLegalOrPromote(ISD, DstLT.second)) - return 1; + return SrcLT.first; // Handle scalar conversions. 
if (!Src->isVectorTy() && !Dst->isVectorTy()) { diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h index e32de8888c06d7..4398377606636d 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h @@ -141,6 +141,14 @@ class CallLowering { uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) = 0; + /// An overload which takes an ArgInfo if additional information about + /// the arg is needed. + virtual void assignValueToAddress(const ArgInfo &Arg, Register Addr, + uint64_t Size, MachinePointerInfo &MPO, + CCValAssign &VA) { + assignValueToAddress(Arg.Regs[0], Addr, Size, MPO, VA); + } + /// Handle custom values, which may be passed into one or more of \p VAs. /// \return The number of \p VAs that have been assigned after the first /// one, and which should therefore be skipped from further @@ -152,7 +160,10 @@ class CallLowering { llvm_unreachable("Custom values not supported"); } - Register extendRegister(Register ValReg, CCValAssign &VA); + /// Extend a register to the location type given in VA, capped at extending + /// to at most MaxSize bits. If MaxSizeBits is 0 then no maximum is set. + Register extendRegister(Register ValReg, CCValAssign &VA, + unsigned MaxSizeBits = 0); virtual bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, const ArgInfo &Info, diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h index 754b7e83135e53..48d00fab2fa94d 100644 --- a/llvm/include/llvm/MC/MCObjectStreamer.h +++ b/llvm/include/llvm/MC/MCObjectStreamer.h @@ -38,7 +38,7 @@ class MCObjectStreamer : public MCStreamer { bool EmitEHFrame; bool EmitDebugFrame; SmallVector PendingLabels; - SmallPtrSet PendingLabelSections; + SmallVector PendingLabelSections; unsigned CurSubsectionIdx; struct PendingMCFixup { const MCSymbol *Sym; diff --git a/llvm/include/llvm/XRay/InstrumentationMap.h b/llvm/include/llvm/XRay/InstrumentationMap.h index 5cbe5c44893b21..aae90345cbb749 100644 --- a/llvm/include/llvm/XRay/InstrumentationMap.h +++ b/llvm/include/llvm/XRay/InstrumentationMap.h @@ -50,6 +50,8 @@ struct SledEntry { /// Whether the sled was annotated to always be instrumented. 
bool AlwaysInstrument; + + unsigned char Version; }; struct YAMLXRaySledEntry { @@ -59,6 +61,7 @@ struct YAMLXRaySledEntry { SledEntry::FunctionKinds Kind; bool AlwaysInstrument; std::string FunctionName; + unsigned char Version; }; /// The InstrumentationMap represents the computed function id's and indicated @@ -120,6 +123,7 @@ template <> struct MappingTraits { IO.mapRequired("kind", Entry.Kind); IO.mapRequired("always-instrument", Entry.AlwaysInstrument); IO.mapOptional("function-name", Entry.FunctionName); + IO.mapOptional("version", Entry.Version, 0); } static constexpr bool flow = true; diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 7b9bcb636861c9..0d31a9db415458 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1767,6 +1767,8 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { CurExceptionSym = nullptr; bool NeedsLocalForSize = MAI->needsLocalForSize(); if (F.hasFnAttribute("patchable-function-entry") || + F.hasFnAttribute("function-instrument") || + F.hasFnAttribute("xray-instruction-threshold") || needFuncLabelsForEHOrDebugInfo(MF, MMI) || NeedsLocalForSize || MF.getTarget().Options.EmitStackSizeSection) { CurrentFnBegin = createTempSymbol("func_begin"); @@ -3174,14 +3176,7 @@ void AsmPrinterHandler::markFunctionEnd() {} // In the binary's "xray_instr_map" section, an array of these function entries // describes each instrumentation point. When XRay patches your code, the index // into this table will be given to your handler as a patch point identifier. -void AsmPrinter::XRayFunctionEntry::emit(int Bytes, MCStreamer *Out, - const MCExpr *Location, - const MCSymbol *CurrentFnSym) const { - if (Location) - Out->emitValueImpl(Location, Bytes); - else - Out->emitSymbolValue(Sled, Bytes); - Out->emitSymbolValue(CurrentFnSym, Bytes); +void AsmPrinter::XRayFunctionEntry::emit(int Bytes, MCStreamer *Out) const { auto Kind8 = static_cast(Kind); Out->emitBinaryData(StringRef(reinterpret_cast(&Kind8), 1)); Out->emitBinaryData( @@ -3234,19 +3229,31 @@ void AsmPrinter::emitXRayTable() { // Now we switch to the instrumentation map section. Because this is done // per-function, we are able to create an index entry that will represent the // range of sleds associated with a function. 
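In the emission loop below, when PCRel is set (computed earlier in this function, outside this hunk), each stored word is the target address minus the address of the word itself; the runtime accessors added earlier in this patch undo that by adding the field's own address back. A minimal decode sketch with illustrative field names:

```cpp
#include <cstdint>

// Minimal sketch of decoding a PC-relative sled entry: each stored value
// is (target - address-of-field), so adding the field's address recovers
// the absolute target.
struct RelativeSledEntry {
  uint64_t Address;  // sled address, stored relative to &Address
  uint64_t Function; // function entry, stored relative to &Function

  uint64_t sled() const {
    return reinterpret_cast<uint64_t>(&Address) + Address;
  }
  uint64_t function() const {
    return reinterpret_cast<uint64_t>(&Function) + Function;
  }
};
```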
+ auto &Ctx = OutContext; MCSymbol *SledsStart = OutContext.createTempSymbol("xray_sleds_start", true); OutStreamer->SwitchSection(InstMap); OutStreamer->emitLabel(SledsStart); for (const auto &Sled : Sleds) { - const MCExpr *Location = nullptr; if (PCRel) { - MCSymbol *Dot = OutContext.createTempSymbol(); + MCSymbol *Dot = Ctx.createTempSymbol(); OutStreamer->emitLabel(Dot); - Location = MCBinaryExpr::createSub( - MCSymbolRefExpr::create(Sled.Sled, OutContext), - MCSymbolRefExpr::create(Dot, OutContext), OutContext); + OutStreamer->emitValueImpl( + MCBinaryExpr::createSub(MCSymbolRefExpr::create(Sled.Sled, Ctx), + MCSymbolRefExpr::create(Dot, Ctx), Ctx), + WordSizeBytes); + OutStreamer->emitValueImpl( + MCBinaryExpr::createSub( + MCSymbolRefExpr::create(CurrentFnBegin, Ctx), + MCBinaryExpr::createAdd( + MCSymbolRefExpr::create(Dot, Ctx), + MCConstantExpr::create(WordSizeBytes, Ctx), Ctx), + Ctx), + WordSizeBytes); + } else { + OutStreamer->emitSymbolValue(Sled.Sled, WordSizeBytes); + OutStreamer->emitSymbolValue(CurrentFnSym, WordSizeBytes); } - Sled.emit(WordSizeBytes, OutStreamer.get(), Location, CurrentFnSym); + Sled.emit(WordSizeBytes, OutStreamer.get()); } MCSymbol *SledsEnd = OutContext.createTempSymbol("xray_sleds_end", true); OutStreamer->emitLabel(SledsEnd); diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 2162b2f041b4b9..70bb272796739e 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -373,7 +373,7 @@ bool CallLowering::handleAssignments(CCState &CCInfo, unsigned Offset = VA.getLocMemOffset(); MachinePointerInfo MPO; Register StackAddr = Handler.getStackAddress(Size, Offset, MPO); - Handler.assignValueToAddress(ArgReg, StackAddr, Size, MPO, VA); + Handler.assignValueToAddress(Args[i], StackAddr, Size, MPO, VA); } else { // FIXME: Support byvals and other weirdness return false; @@ -458,10 +458,19 @@ bool CallLowering::resultsCompatible(CallLoweringInfo &Info, } Register CallLowering::ValueHandler::extendRegister(Register ValReg, - CCValAssign &VA) { + CCValAssign &VA, + unsigned MaxSizeBits) { LLT LocTy{VA.getLocVT()}; - if (LocTy.getSizeInBits() == MRI.getType(ValReg).getSizeInBits()) + LLT ValTy = MRI.getType(ValReg); + if (LocTy.getSizeInBits() == ValTy.getSizeInBits()) return ValReg; + + if (LocTy.isScalar() && MaxSizeBits && MaxSizeBits < LocTy.getSizeInBits()) { + if (MaxSizeBits <= ValTy.getSizeInBits()) + return ValReg; + LocTy = LLT::scalar(MaxSizeBits); + } + switch (VA.getLocInfo()) { default: break; case CCValAssign::Full: diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 8ee85c6229b65e..7afa61f2c4dbd4 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -1220,6 +1220,10 @@ bool MachineInstr::mayAlias(AAResults *AA, const MachineInstr &Other, if (!mayStore() && !Other.mayStore()) return false; + // Both instructions must be memory operations to be able to alias. + if (!mayLoadOrStore() || !Other.mayLoadOrStore()) + return false; + // Let the target decide if memory accesses cannot possibly overlap. 
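The two early returns added to MachineInstr::mayAlias are cheap flag checks that run before any operand or alias-analysis work: two loads can never conflict, and an instruction that touches no memory cannot alias one that does. A standalone model of just that ordering, with plain bools instead of MachineInstr:

    #include <iostream>

    struct MemFlags { bool Loads; bool Stores; };

    // Mirrors the order of the early-outs: the store check first, then the
    // "both must actually be memory operations" check, then fall through to
    // the expensive analysis (modelled here as simply returning true).
    static bool mightInterfere(MemFlags A, MemFlags B) {
      if (!A.Stores && !B.Stores)
        return false;                              // two reads never conflict
      if ((!A.Loads && !A.Stores) || (!B.Loads && !B.Stores))
        return false;                              // non-memory op can't alias
      return true;
    }

    int main() {
      std::cout << std::boolalpha
                << mightInterfere({true, false}, {true, false}) << '\n'   // false
                << mightInterfere({false, false}, {false, true}) << '\n'  // false
                << mightInterfere({true, false}, {false, true}) << '\n';  // true
    }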
if (TII->areMemAccessesTriviallyDisjoint(*this, Other)) return false; diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index fd4422fc368cec..5bb8e76713fbeb 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -59,8 +59,12 @@ void MCObjectStreamer::addPendingLabel(MCSymbol* S) { CurSection->addPendingLabel(S, CurSubsectionIdx); // Add this Section to the list of PendingLabelSections. - PendingLabelSections.insert(CurSection); - } else + auto SecIt = std::find(PendingLabelSections.begin(), + PendingLabelSections.end(), CurSection); + if (SecIt == PendingLabelSections.end()) + PendingLabelSections.push_back(CurSection); + } + else // There is no Section / Subsection for this label yet. PendingLabels.push_back(S); } diff --git a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/AArch64CallLowering.cpp index b15bbe106f17b2..f6daa5f7f9e2f2 100644 --- a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64CallLowering.cpp @@ -171,17 +171,33 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler { void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { - if (VA.getLocInfo() == CCValAssign::LocInfo::AExt) { - Size = VA.getLocVT().getSizeInBits() / 8; - ValVReg = MIRBuilder.buildAnyExt(LLT::scalar(Size * 8), ValVReg) - .getReg(0); - } MachineFunction &MF = MIRBuilder.getMF(); auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, Size, inferAlignFromPtrInfo(MF, MPO)); MIRBuilder.buildStore(ValVReg, Addr, *MMO); } + void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr, + uint64_t Size, MachinePointerInfo &MPO, + CCValAssign &VA) override { + unsigned MaxSize = Size * 8; + // For varargs, we always want to extend them to 8 bytes, in which case + // we disable setting a max. + if (!Arg.IsFixed) + MaxSize = 0; + + Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt + ? extendRegister(Arg.Regs[0], VA, MaxSize) + : Arg.Regs[0]; + + // If we extended we might need to adjust the MMO's Size. + const LLT RegTy = MRI.getType(ValVReg); + if (RegTy.getSizeInBytes() > Size) + Size = RegTy.getSizeInBytes(); + + assignValueToAddress(ValVReg, Addr, Size, MPO, VA); + } + bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, const CallLowering::ArgInfo &Info, diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 25237bf50ddeb7..f8709bc8bc4b18 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1144,24 +1144,11 @@ static int alignTo(int Num, int PowOf2) { return (Num + PowOf2 - 1) & ~(PowOf2 - 1); } -static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, - AliasAnalysis *AA) { - // One of the instructions must modify memory. - if (!MIa.mayStore() && !MIb.mayStore()) - return false; - - // Both instructions must be memory operations. 
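The MaxSizeBits parameter threaded into extendRegister, and used by the AArch64 outgoing-argument handler above, caps how far a value is widened before being stored to its stack slot; the handler passes 0 for variadic arguments to disable the cap so they are widened to the full location type. A self-contained sketch of that decision, with widths in bits and illustrative scenarios in main():

    #include <cassert>

    // Follows the shape of the new extendRegister logic; not LLVM code.
    static unsigned extendedWidth(unsigned ValBits, unsigned LocBits,
                                  unsigned MaxSizeBits /* 0 means no cap */) {
      if (ValBits == LocBits)
        return ValBits;                 // already the right size
      if (MaxSizeBits != 0 && MaxSizeBits < LocBits) {
        if (MaxSizeBits <= ValBits)
          return ValBits;               // cap already satisfied, leave it alone
        return MaxSizeBits;             // widen only up to the cap
      }
      return LocBits;                   // usual case: widen to the loc type
    }

    int main() {
      // Fixed i8 with a 1-byte slot: cap of 8 bits, nothing is widened.
      assert(extendedWidth(8, 64, 8) == 8);
      // Variadic i8 with the cap disabled: widened to the 64-bit loc type,
      // which is why the store (and its MMO) grows to 8 bytes.
      assert(extendedWidth(8, 64, 0) == 64);
      // Value already matches the location type: returned unchanged.
      assert(extendedWidth(32, 32, 16) == 32);
      return 0;
    }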
- if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore()) - return false; - - return MIa.mayAlias(AA, MIb, /*UseTBAA*/false); -} - static bool mayAlias(MachineInstr &MIa, SmallVectorImpl &MemInsns, AliasAnalysis *AA) { for (MachineInstr *MIb : MemInsns) - if (mayAlias(MIa, *MIb, AA)) + if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) return true; return false; @@ -1219,7 +1206,7 @@ bool AArch64LoadStoreOpt::findMatchingStore( return false; // If we encounter a store aliased with the load, return early. - if (MI.mayStore() && mayAlias(LoadMI, MI, AA)) + if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false)) return false; } while (MBBI != B && Count < Limit); return false; diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 924296a2ff33b8..0a16a61203c8bd 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -265,8 +265,11 @@ class SILoadStoreOptimizer : public MachineFunctionPass { SmallPtrSet &Promoted) const; void addInstToMergeableList(const CombineInfo &CI, std::list > &MergeableInsts) const; - bool collectMergeableInsts(MachineBasicBlock &MBB, - std::list > &MergeableInsts) const; + + std::pair collectMergeableInsts( + MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, + MemInfoMap &Visited, SmallPtrSet &AnchorList, + std::list> &MergeableInsts) const; public: static char ID; @@ -1944,31 +1947,38 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, MergeableInsts.emplace_back(1, CI); } -bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB, - std::list > &MergeableInsts) const { +std::pair +SILoadStoreOptimizer::collectMergeableInsts( + MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, + MemInfoMap &Visited, SmallPtrSet &AnchorList, + std::list> &MergeableInsts) const { bool Modified = false; - // Contain the list - MemInfoMap Visited; - // Contains the list of instructions for which constant offsets are being - // promoted to the IMM. - SmallPtrSet AnchorList; // Sort potential mergeable instructions into lists. One list per base address. unsigned Order = 0; - for (MachineInstr &MI : MBB.instrs()) { + MachineBasicBlock::iterator BlockI = Begin; + for (; BlockI != End; ++BlockI) { + MachineInstr &MI = *BlockI; + // We run this before checking if an address is mergeable, because it can produce // better code even if the instructions aren't mergeable. if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) Modified = true; + // Don't combine if volatile. We also won't be able to merge across this, so + // break the search. We can look after this barrier for separate merges. + if (MI.hasOrderedMemoryRef()) { + LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI); + + // Search will resume after this instruction in a separate merge list. + ++BlockI; + break; + } + const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII); if (InstClass == UNKNOWN) continue; - // Don't combine if volatile. 
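With this change collectMergeableInsts stops scanning when it reaches an instruction with an ordered memory reference and resumes after it, so merge candidates are grouped per fence-free window rather than the fence simply being skipped. A toy standalone model of that windowing, with strings standing in for machine instructions:

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> Block = {"store A", "store B", "fence",
                                        "store C", "store D"};
      std::vector<std::vector<std::string>> Windows;
      std::vector<std::string> Cur;
      for (const std::string &I : Block) {
        if (I == "fence") {        // hasOrderedMemoryRef() in the real pass
          Windows.push_back(Cur);  // close the current search window
          Cur.clear();             // and start a fresh one after the barrier
          continue;
        }
        Cur.push_back(I);
      }
      Windows.push_back(Cur);
      for (const auto &W : Windows) {  // prints two separate windows
        for (const auto &I : W)
          std::cout << I << "; ";
        std::cout << '\n';
      }
    }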
- if (MI.hasOrderedMemoryRef()) - continue; - CombineInfo CI; CI.setMI(MI, *TII, *STM); CI.Order = Order++; @@ -2008,7 +2018,7 @@ bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB, ++I; } - return Modified; + return std::make_pair(BlockI, Modified); } // Scan through looking for adjacent LDS operations with constant offsets from @@ -2161,15 +2171,33 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { bool Modified = false; + // Contains the list of instructions for which constant offsets are being + // promoted to the IMM. This is tracked for an entire block at time. + SmallPtrSet AnchorList; + MemInfoMap Visited; for (MachineBasicBlock &MBB : MF) { - std::list > MergeableInsts; - // First pass: Collect list of all instructions we know how to merge. - Modified |= collectMergeableInsts(MBB, MergeableInsts); - do { - OptimizeAgain = false; - Modified |= optimizeBlock(MergeableInsts); - } while (OptimizeAgain); + MachineBasicBlock::iterator SectionEnd; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; + I = SectionEnd) { + bool CollectModified; + std::list> MergeableInsts; + + // First pass: Collect list of all instructions we know how to merge in a + // subset of the block. + std::tie(SectionEnd, CollectModified) = + collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts); + + Modified |= CollectModified; + + do { + OptimizeAgain = false; + Modified |= optimizeBlock(MergeableInsts); + } while (OptimizeAgain); + } + + Visited.clear(); + AnchorList.clear(); } return Modified; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 28b175a88ad3b8..1f75dd184cf058 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -588,6 +588,18 @@ def SFENCE_VMA : RVInstR<0b0001001, 0b000, OPC_SYSTEM, (outs), let rd = 0; } +//===----------------------------------------------------------------------===// +// Debug instructions +//===----------------------------------------------------------------------===// + +let isBarrier = 1, isReturn = 1, isTerminator = 1 in { +def DRET : Priv<"dret", 0b0111101>, Sched<[]> { + let rd = 0; + let rs1 = 0; + let rs2 = 0b10010; +} +} // isBarrier = 1, isReturn = 1, isTerminator = 1 + //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20) //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 2bc5fe90f84cf1..be5a375a2cfa1e 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -202,10 +202,23 @@ class SimplifyCFGOpt { bool simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder); bool simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder); bool simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder); + bool SimplifyCondBranchToTwoReturns(BranchInst *BI, IRBuilder<> &Builder); bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, IRBuilder<> &Builder); + bool HoistThenElseCodeToIf(BranchInst *BI, const TargetTransformInfo &TTI); + bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, + const TargetTransformInfo &TTI); + bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond, + BasicBlock *TrueBB, BasicBlock *FalseBB, + uint32_t TrueWeight, uint32_t FalseWeight); + bool 
SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, + const DataLayout &DL); + bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select); + bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI); + bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder); + public: SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout &DL, SmallPtrSetImpl *LoopHeaders, @@ -1235,8 +1248,8 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I); /// Given a conditional branch that goes to BB1 and BB2, hoist any common code /// in the two blocks up into the branch block. The caller of this function /// guarantees that BI's block dominates BB1 and BB2. -static bool HoistThenElseCodeToIf(BranchInst *BI, - const TargetTransformInfo &TTI) { +bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI, + const TargetTransformInfo &TTI) { // This does very trivial matching, with limited scanning, to find identical // instructions in the two blocks. In particular, we don't want to get into // O(M*N) situations here where M and N are the sizes of BB1 and BB2. As @@ -1969,8 +1982,8 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, /// \endcode /// /// \returns true if the conditional block is removed. -static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, - const TargetTransformInfo &TTI) { +bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, + const TargetTransformInfo &TTI) { // Be conservative for now. FP select instruction can often be expensive. Value *BrCond = BI->getCondition(); if (isa(BrCond)) @@ -2462,8 +2475,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, /// If we found a conditional branch that goes to two returning blocks, /// try to merge them together into one return, /// introducing a select if the return values disagree. -static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, - IRBuilder<> &Builder) { +bool SimplifyCFGOpt::SimplifyCondBranchToTwoReturns(BranchInst *BI, + IRBuilder<> &Builder) { assert(BI->isConditional() && "Must be a conditional branch"); BasicBlock *TrueSucc = BI->getSuccessor(0); BasicBlock *FalseSucc = BI->getSuccessor(1); @@ -3522,10 +3535,11 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, // Takes care of updating the successors and removing the old terminator. // Also makes sure not to introduce new successors by assuming that edges to // non-successor TrueBBs and FalseBBs aren't reachable. -static bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond, - BasicBlock *TrueBB, BasicBlock *FalseBB, - uint32_t TrueWeight, - uint32_t FalseWeight) { +bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm, + Value *Cond, BasicBlock *TrueBB, + BasicBlock *FalseBB, + uint32_t TrueWeight, + uint32_t FalseWeight) { // Remove any superfluous successor edges from the CFG. // First, figure out which successors to preserve. // If TrueBB and FalseBB are equal, only try to preserve one copy of that @@ -3585,7 +3599,8 @@ static bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond, // (switch (select cond, X, Y)) on constant X, Y // with a branch - conditional if X and Y lead to distinct BBs, // unconditional otherwise. -static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) { +bool SimplifyCFGOpt::SimplifySwitchOnSelect(SwitchInst *SI, + SelectInst *Select) { // Check for constant integer values in the select. 
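These SimplifyCFG hunks change only linkage and qualification: a set of file-static helpers become SimplifyCFGOpt member functions with the same bodies. The diff itself does not say why, but a common motivation for this kind of refactor is letting the helpers reach per-instance state directly instead of threading it through every call; a minimal illustration with invented names:

    #include <iostream>

    struct SimplifierOptions { bool AggressiveHoisting; };

    class Simplifier {
      SimplifierOptions Opts;

    public:
      explicit Simplifier(SimplifierOptions O) : Opts(O) {}

      // Previously a free function taking everything as parameters; as a
      // member it can consult Opts (or a TTI reference, etc.) directly.
      bool hoistCommonCode(int BlockSize) const {
        return Opts.AggressiveHoisting || BlockSize < 8;
      }
    };

    int main() {
      Simplifier S({/*AggressiveHoisting=*/true});
      std::cout << std::boolalpha << S.hoistCommonCode(100) << '\n';  // true
    }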
ConstantInt *TrueVal = dyn_cast(Select->getTrueValue()); ConstantInt *FalseVal = dyn_cast(Select->getFalseValue()); @@ -3621,7 +3636,8 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) { // blockaddress(@fn, BlockB))) // with // (br cond, BlockA, BlockB). -static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) { +bool SimplifyCFGOpt::SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, + SelectInst *SI) { // Check that both operands of the select are block addresses. BlockAddress *TBA = dyn_cast(SI->getTrueValue()); BlockAddress *FBA = dyn_cast(SI->getFalseValue()); @@ -3756,8 +3772,9 @@ bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt( /// The specified branch is a conditional branch. /// Check to see if it is branching on an or/and chain of icmp instructions, and /// fold it into a switch instruction if so. -static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, - const DataLayout &DL) { +bool SimplifyCFGOpt::SimplifyBranchOnICmpChain(BranchInst *BI, + IRBuilder<> &Builder, + const DataLayout &DL) { Instruction *Cond = dyn_cast(BI->getCondition()); if (!Cond) return false; @@ -4407,7 +4424,8 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch) { /// Turn a switch with two reachable destinations into an integer range /// comparison and branch. -static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { +bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI, + IRBuilder<> &Builder) { assert(SI->getNumCases() > 1 && "Degenerate switch?"); bool HasDefault = diff --git a/llvm/lib/XRay/InstrumentationMap.cpp b/llvm/lib/XRay/InstrumentationMap.cpp index 1e9b69a5f9dccc..b095d7134a5fef 100644 --- a/llvm/lib/XRay/InstrumentationMap.cpp +++ b/llvm/lib/XRay/InstrumentationMap.cpp @@ -68,10 +68,13 @@ loadObj(StringRef Filename, object::OwningBinary &ObjFile, StringRef Contents = ""; const auto &Sections = ObjFile.getBinary()->sections(); + uint64_t Address = 0; auto I = llvm::find_if(Sections, [&](object::SectionRef Section) { Expected NameOrErr = Section.getName(); - if (NameOrErr) + if (NameOrErr) { + Address = Section.getAddress(); return *NameOrErr == "xray_instr_map"; + } consumeError(NameOrErr.takeError()); return false; }); @@ -141,6 +144,7 @@ loadObj(StringRef Filename, object::OwningBinary &ObjFile, return Address; }; + const int WordSize = 8; int32_t FuncId = 1; uint64_t CurFn = 0; for (; C != Contents.bytes_end(); C += ELF64SledEntrySize) { @@ -165,6 +169,11 @@ loadObj(StringRef Filename, object::OwningBinary &ObjFile, std::make_error_code(std::errc::executable_format_error)); Entry.Kind = Kinds[Kind]; Entry.AlwaysInstrument = Extractor.getU8(&OffsetPtr) != 0; + Entry.Version = Extractor.getU8(&OffsetPtr); + if (Entry.Version >= 2) { + Entry.Address += C - Contents.bytes_begin() + Address; + Entry.Function += C - Contents.bytes_begin() + WordSize + Address; + } // We do replicate the function id generation scheme implemented in the // XRay runtime. 
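For version >= 2 entries the loader above rebuilds absolute PCs by adding the position of the field itself: the section's load address plus the entry's byte offset, plus one extra word for the function field. A self-contained check of that arithmetic with made-up addresses:

    #include <cassert>
    #include <cstdint>

    int main() {
      const std::uint64_t WordSize = 8;
      const std::uint64_t SectionAddr = 0x500000; // address of xray_instr_map
      const std::uint64_t EntryOffset = 0x40;     // C - Contents.bytes_begin()

      // Raw self-relative words as read from a version-2 entry (invented).
      const std::int64_t RawAddress = -0xff040;   // sled PC relative to field
      const std::int64_t RawFunction = -0xff058;  // fn PC relative to next field

      std::uint64_t SledPC = RawAddress + EntryOffset + SectionAddr;
      std::uint64_t FuncPC = RawFunction + EntryOffset + WordSize + SectionAddr;

      assert(SledPC == 0x401000);
      assert(FuncPC == 0x400ff0);
      return 0;
    }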
@@ -209,8 +218,8 @@ loadYAML(sys::fs::file_t Fd, size_t FileSize, StringRef Filename, for (const auto &Y : YAMLSleds) { FunctionAddresses[Y.FuncId] = Y.Function; FunctionIds[Y.Function] = Y.FuncId; - Sleds.push_back( - SledEntry{Y.Address, Y.Function, Y.Kind, Y.AlwaysInstrument}); + Sleds.push_back(SledEntry{Y.Address, Y.Function, Y.Kind, Y.AlwaysInstrument, + Y.Version}); } return Error::success(); } diff --git a/llvm/test/Analysis/CostModel/ARM/cast.ll b/llvm/test/Analysis/CostModel/ARM/cast.ll index 5d857bc2e80280..edfe281ff53416 100644 --- a/llvm/test/Analysis/CostModel/ARM/cast.ll +++ b/llvm/test/Analysis/CostModel/ARM/cast.ll @@ -424,8 +424,8 @@ define i32 @casts() { ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16> -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32> -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1> @@ -444,8 +444,8 @@ define i32 @casts() { ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16> -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32> -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1322 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1322 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 661 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1> @@ -504,8 +504,8 @@ define i32 @casts() { ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 
10 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float> -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float> -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 522 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double> @@ -524,8 +524,8 @@ define i32 @casts() { ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float> -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float> -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float> +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float> ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1045 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double> @@ -611,16 +611,16 @@ define i32 @casts() { ; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float ; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double ; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q70 = sext <4 x i8> undef to <4 x i32> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q71 = sext <8 x i8> undef to <8 x i16> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s70 = sext <4 x i8> undef to <4 x i32> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r70 = sext <8 x i8> undef to <8 x i32> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r71 = sext <16 x i8> undef to <16 x i32> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q72 = zext <4 x i8> undef to <4 x i32> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q73 = zext <8 x i8> undef to <8 x i16> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s72 
= zext <4 x i8> undef to <4 x i32> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r72 = zext <8 x i8> undef to <8 x i32> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r73 = zext <16 x i8> undef to <16 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %q70 = sext <4 x i8> undef to <4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %q71 = sext <8 x i8> undef to <8 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s70 = sext <4 x i8> undef to <4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r70 = sext <8 x i8> undef to <8 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r71 = sext <16 x i8> undef to <16 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %q72 = zext <4 x i8> undef to <4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %q73 = zext <8 x i8> undef to <8 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r72 = zext <8 x i8> undef to <8 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r73 = zext <16 x i8> undef to <16 x i32> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64> @@ -878,16 +878,16 @@ define i32 @casts() { ; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float ; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double ; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q70 = sext <4 x i8> undef to <4 x i32> -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q71 = sext <8 x i8> undef to <8 x i16> -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s70 = sext <4 x i8> undef to <4 x i32> -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r70 = sext <8 x i8> undef to <8 x i32> -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r71 = sext <16 x i8> undef to <16 x i32> -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q72 = zext <4 x i8> undef to <4 x i32> -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q73 = zext <8 x i8> undef to <8 x i16> -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s72 = zext <4 x i8> undef to <4 x i32> -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r72 = zext <8 x i8> undef to <8 x i32> -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r73 = zext <16 x i8> undef to <16 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost 
of 4 for instruction: %q70 = sext <4 x i8> undef to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %q71 = sext <8 x i8> undef to <8 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s70 = sext <4 x i8> undef to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r70 = sext <8 x i8> undef to <8 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r71 = sext <16 x i8> undef to <16 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %q72 = zext <4 x i8> undef to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %q73 = zext <8 x i8> undef to <8 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r72 = zext <8 x i8> undef to <8 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r73 = zext <16 x i8> undef to <16 x i32> ; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64> ; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64> ; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64> @@ -1738,14 +1738,14 @@ define i32 @load_extends() { ; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64 ; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64 ; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64 -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6 = sext <4 x i16> %loadv4i16 to 
<4 x i32> +; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64> @@ -1774,14 +1774,14 @@ define i32 @load_extends() { ; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64 ; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64 ; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64 -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16> -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16> -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32> -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32> ; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64> ; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64> -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32> -; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32> +; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32> ; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64> ; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64> ; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64> diff --git a/llvm/test/Analysis/CostModel/X86/fptosi.ll b/llvm/test/Analysis/CostModel/X86/fptosi.ll index a230648eb5833d..b9f01b55501156 100644 --- a/llvm/test/Analysis/CostModel/X86/fptosi.ll +++ b/llvm/test/Analysis/CostModel/X86/fptosi.ll @@ -219,12 +219,26 @@ define i32 @fptosi_float_i64(i32 %arg) { } define i32 @fptosi_float_i32(i32 %arg) { -; CHECK-LABEL: 'fptosi_float_i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32 -; CHECK-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x float> undef to <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'fptosi_float_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = fptosi <8 x float> undef to <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptosi_float_i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x float> undef to <8 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'fptosi_float_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x float> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I32 = fptosi float undef to i32 %V4I32 = fptosi <4 x float> undef to <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll index f06cd14abe753f..eec73695808199 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll @@ -1,31 +1,101 @@ -; RUN: llc -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - | FileCheck %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "aarch64-apple-ios9.0" -; CHECK-LABEL: name: test_varargs -; CHECK: [[ANSWER:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 -; CHECK: [[D_ONE:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 -; CHECK: [[TWELVE:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 -; CHECK: [[THREE:%[0-9]+]]:_(s8) = G_CONSTANT i8 3 -; CHECK: [[ONE:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 -; CHECK: [[FOUR:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 -; CHECK: [[F_ONE:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 -; CHECK: [[TWO:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00 - -; CHECK: $w0 = COPY [[ANSWER]] -; CHECK: $d0 = 
COPY [[D_ONE]] -; CHECK: $x1 = COPY [[TWELVE]] -; CHECK: [[THREE_EXT:%[0-9]+]]:_(s64) = G_ANYEXT [[THREE]] -; CHECK: G_STORE [[THREE_EXT]](s64), {{%[0-9]+}}(p0) :: (store 8 into stack, align 1) -; CHECK: [[ONE_EXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ONE]] -; CHECK: G_STORE [[ONE_EXT]](s64), {{%[0-9]+}}(p0) :: (store 8 into stack + 8, align 1) -; CHECK: [[FOUR_EXT:%[0-9]+]]:_(s64) = G_ANYEXT [[FOUR]] -; CHECK: G_STORE [[FOUR_EXT]](s64), {{%[0-9]+}}(p0) :: (store 8 into stack + 16, align 1) -; CHECK: G_STORE [[F_ONE]](s32), {{%[0-9]+}}(p0) :: (store 4 into stack + 24, align 1) -; CHECK: G_STORE [[TWO]](s64), {{%[0-9]+}}(p0) :: (store 8 into stack + 32, align 1) declare void @varargs(i32, double, i64, ...) define void @test_varargs() { + ; CHECK-LABEL: name: test_varargs + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 + ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; CHECK: [[C3:%[0-9]+]]:_(s8) = G_CONSTANT i8 3 + ; CHECK: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK: [[C7:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00 + ; CHECK: ADJCALLSTACKDOWN 40, 0, implicit-def $sp, implicit $sp + ; CHECK: $w0 = COPY [[C]](s32) + ; CHECK: $d0 = COPY [[C1]](s64) + ; CHECK: $x1 = COPY [[C2]](s64) + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $sp + ; CHECK: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C8]](s64) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s8) + ; CHECK: G_STORE [[ANYEXT]](s64), [[PTR_ADD]](p0) :: (store 8 into stack, align 1) + ; CHECK: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C9]](s64) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C4]](s16) + ; CHECK: G_STORE [[ANYEXT1]](s64), [[PTR_ADD1]](p0) :: (store 8 into stack + 8, align 1) + ; CHECK: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C10]](s64) + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C5]](s32) + ; CHECK: G_STORE [[ANYEXT2]](s64), [[PTR_ADD2]](p0) :: (store 8 into stack + 16, align 1) + ; CHECK: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 + ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C11]](s64) + ; CHECK: G_STORE [[C6]](s32), [[PTR_ADD3]](p0) :: (store 4 into stack + 24, align 1) + ; CHECK: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; CHECK: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C12]](s64) + ; CHECK: G_STORE [[C7]](s64), [[PTR_ADD4]](p0) :: (store 8 into stack + 32, align 1) + ; CHECK: BL @varargs, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $w0, implicit $d0, implicit $x1 + ; CHECK: ADJCALLSTACKUP 40, 0, implicit-def $sp, implicit $sp + ; CHECK: RET_ReallyLR call void(i32, double, i64, ...) 
@varargs(i32 42, double 1.0, i64 12, i8 3, i16 1, i32 4, float 1.0, double 2.0) ret void } + +declare i64 @i8i16callee(i64 %a1, i64 %a2, i64 %a3, i8 signext %a4, i16 signext %a5, i64 %a6, i64 %a7, i64 %a8, i8 signext %b1, i16 signext %b2, i8 signext %b3, i8 signext %b4) nounwind readnone noinline + +define i32 @i8i16caller() nounwind readnone { + ; CHECK-LABEL: name: i8i16caller + ; CHECK: bb.1.entry: + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK: [[C3:%[0-9]+]]:_(s8) = G_CONSTANT i8 3 + ; CHECK: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 4 + ; CHECK: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 + ; CHECK: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 + ; CHECK: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 + ; CHECK: [[C8:%[0-9]+]]:_(s8) = G_CONSTANT i8 97 + ; CHECK: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 98 + ; CHECK: [[C10:%[0-9]+]]:_(s8) = G_CONSTANT i8 99 + ; CHECK: [[C11:%[0-9]+]]:_(s8) = G_CONSTANT i8 100 + ; CHECK: ADJCALLSTACKDOWN 6, 0, implicit-def $sp, implicit $sp + ; CHECK: $x0 = COPY [[C]](s64) + ; CHECK: $x1 = COPY [[C1]](s64) + ; CHECK: $x2 = COPY [[C2]](s64) + ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[C3]](s8) + ; CHECK: $w3 = COPY [[SEXT]](s32) + ; CHECK: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[C4]](s16) + ; CHECK: $w4 = COPY [[SEXT1]](s32) + ; CHECK: $x5 = COPY [[C5]](s64) + ; CHECK: $x6 = COPY [[C6]](s64) + ; CHECK: $x7 = COPY [[C7]](s64) + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $sp + ; CHECK: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C12]](s64) + ; CHECK: G_STORE [[C8]](s8), [[PTR_ADD]](p0) :: (store 1 into stack) + ; CHECK: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C13]](s64) + ; CHECK: G_STORE [[C9]](s16), [[PTR_ADD1]](p0) :: (store 2 into stack + 2, align 1) + ; CHECK: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C14]](s64) + ; CHECK: G_STORE [[C10]](s8), [[PTR_ADD2]](p0) :: (store 1 into stack + 4) + ; CHECK: [[C15:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 + ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C15]](s64) + ; CHECK: G_STORE [[C11]](s8), [[PTR_ADD3]](p0) :: (store 1 into stack + 5) + ; CHECK: BL @i8i16callee, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0, implicit $x1, implicit $x2, implicit $w3, implicit $w4, implicit $x5, implicit $x6, implicit $x7, implicit-def $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: ADJCALLSTACKUP 6, 0, implicit-def $sp, implicit $sp + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK: $w0 = COPY [[TRUNC]](s32) + ; CHECK: RET_ReallyLR implicit $w0 +entry: + %call = tail call i64 @i8i16callee(i64 0, i64 1, i64 2, i8 signext 3, i16 signext 4, i64 5, i64 6, i64 7, i8 97, i16 98, i8 99, i8 100) + %conv = trunc i64 %call to i32 + ret i32 %conv +} + diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll index 21cc3f82afdf17..cb3dd9a1809712 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll @@ -1,84 +1,100 @@ -; RUN: llc -O0 -stop-after=irtranslator -global-isel -global-isel-abort=1 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -O0 
-stop-after=irtranslator -global-isel -global-isel-abort=1 -verify-machineinstrs %s -o - | FileCheck %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "aarch64-linux-gnu" -; CHECK-LABEL: name: args_i32 -; CHECK: %[[ARG0:[0-9]+]]:_(s32) = COPY $w0 -; CHECK: %{{[0-9]+}}:_(s32) = COPY $w1 -; CHECK: %{{[0-9]+}}:_(s32) = COPY $w2 -; CHECK: %{{[0-9]+}}:_(s32) = COPY $w3 -; CHECK: %{{[0-9]+}}:_(s32) = COPY $w4 -; CHECK: %{{[0-9]+}}:_(s32) = COPY $w5 -; CHECK: %{{[0-9]+}}:_(s32) = COPY $w6 -; CHECK: %{{[0-9]+}}:_(s32) = COPY $w7 -; CHECK: $w0 = COPY %[[ARG0]] - define i32 @args_i32(i32 %w0, i32 %w1, i32 %w2, i32 %w3, + ; CHECK-LABEL: name: args_i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1, $w2, $w3, $w4, $w5, $w6, $w7 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3 + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY $w4 + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY $w5 + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY $w6 + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY $w7 + ; CHECK: $w0 = COPY [[COPY]](s32) + ; CHECK: RET_ReallyLR implicit $w0 i32 %w4, i32 %w5, i32 %w6, i32 %w7) { ret i32 %w0 } -; CHECK-LABEL: name: args_i64 -; CHECK: %[[ARG0:[0-9]+]]:_(s64) = COPY $x0 -; CHECK: %{{[0-9]+}}:_(s64) = COPY $x1 -; CHECK: %{{[0-9]+}}:_(s64) = COPY $x2 -; CHECK: %{{[0-9]+}}:_(s64) = COPY $x3 -; CHECK: %{{[0-9]+}}:_(s64) = COPY $x4 -; CHECK: %{{[0-9]+}}:_(s64) = COPY $x5 -; CHECK: %{{[0-9]+}}:_(s64) = COPY $x6 -; CHECK: %{{[0-9]+}}:_(s64) = COPY $x7 -; CHECK: $x0 = COPY %[[ARG0]] define i64 @args_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3, + ; CHECK-LABEL: name: args_i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK: [[COPY4:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK: [[COPY5:%[0-9]+]]:_(s64) = COPY $x5 + ; CHECK: [[COPY6:%[0-9]+]]:_(s64) = COPY $x6 + ; CHECK: [[COPY7:%[0-9]+]]:_(s64) = COPY $x7 + ; CHECK: $x0 = COPY [[COPY]](s64) + ; CHECK: RET_ReallyLR implicit $x0 i64 %x4, i64 %x5, i64 %x6, i64 %x7) { ret i64 %x0 } -; CHECK-LABEL: name: args_ptrs -; CHECK: %[[ARG0:[0-9]+]]:_(p0) = COPY $x0 -; CHECK: %{{[0-9]+}}:_(p0) = COPY $x1 -; CHECK: %{{[0-9]+}}:_(p0) = COPY $x2 -; CHECK: %{{[0-9]+}}:_(p0) = COPY $x3 -; CHECK: %{{[0-9]+}}:_(p0) = COPY $x4 -; CHECK: %{{[0-9]+}}:_(p0) = COPY $x5 -; CHECK: %{{[0-9]+}}:_(p0) = COPY $x6 -; CHECK: %{{[0-9]+}}:_(p0) = COPY $x7 -; CHECK: $x0 = COPY %[[ARG0]] define i8* @args_ptrs(i8* %x0, i16* %x1, <2 x i8>* %x2, {i8, i16, i32}* %x3, + ; CHECK-LABEL: name: args_ptrs + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY $x2 + ; CHECK: [[COPY3:%[0-9]+]]:_(p0) = COPY $x3 + ; CHECK: [[COPY4:%[0-9]+]]:_(p0) = COPY $x4 + ; CHECK: [[COPY5:%[0-9]+]]:_(p0) = COPY $x5 + ; CHECK: [[COPY6:%[0-9]+]]:_(p0) = COPY $x6 + ; CHECK: [[COPY7:%[0-9]+]]:_(p0) = COPY $x7 + ; CHECK: $x0 = COPY [[COPY]](p0) + ; CHECK: RET_ReallyLR implicit $x0 [3 x float]* %x4, double* %x5, i8* %x6, i8* %x7) { ret i8* %x0 } -; CHECK-LABEL: name: args_arr -; CHECK: %[[ARG0:[0-9]+]]:_(s64) = COPY $d0 -; CHECK: $d0 = COPY %[[ARG0]] define [1 x double] @args_arr([1 
x double] %d0) { + ; CHECK-LABEL: name: args_arr + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $d0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK: $d0 = COPY [[COPY]](s64) + ; CHECK: RET_ReallyLR implicit $d0 ret [1 x double] %d0 } -; CHECK-LABEL: name: test_varargs -; CHECK: [[ANSWER:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 -; CHECK: [[D_ONE:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 -; CHECK: [[TWELVE:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 -; CHECK: [[THREE:%[0-9]+]]:_(s8) = G_CONSTANT i8 3 -; CHECK: [[ONE:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 -; CHECK: [[FOUR:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 -; CHECK: [[F_ONE:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 -; CHECK: [[TWO:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00 - -; CHECK: $w0 = COPY [[ANSWER]] -; CHECK: $d0 = COPY [[D_ONE]] -; CHECK: $x1 = COPY [[TWELVE]] -; CHECK: [[THREE_TMP:%[0-9]+]]:_(s32) = G_ANYEXT [[THREE]] -; CHECK: $w2 = COPY [[THREE_TMP]](s32) -; CHECK: [[ONE_TMP:%[0-9]+]]:_(s32) = G_ANYEXT [[ONE]] -; CHECK: $w3 = COPY [[ONE_TMP]](s32) -; CHECK: $w4 = COPY [[FOUR]](s32) -; CHECK: $s1 = COPY [[F_ONE]](s32) -; CHECK: $d2 = COPY [[TWO]](s64) declare void @varargs(i32, double, i64, ...) define void @test_varargs() { + ; CHECK-LABEL: name: test_varargs + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 + ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; CHECK: [[C3:%[0-9]+]]:_(s8) = G_CONSTANT i8 3 + ; CHECK: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK: [[C7:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00 + ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK: $w0 = COPY [[C]](s32) + ; CHECK: $d0 = COPY [[C1]](s64) + ; CHECK: $x1 = COPY [[C2]](s64) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C3]](s8) + ; CHECK: $w2 = COPY [[ANYEXT]](s32) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[C4]](s16) + ; CHECK: $w3 = COPY [[ANYEXT1]](s32) + ; CHECK: $w4 = COPY [[C5]](s32) + ; CHECK: $s1 = COPY [[C6]](s32) + ; CHECK: $d2 = COPY [[C7]](s64) + ; CHECK: BL @varargs, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $w0, implicit $d0, implicit $x1, implicit $w2, implicit $w3, implicit $w4, implicit $s1, implicit $d2 + ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK: RET_ReallyLR call void(i32, double, i64, ...) @varargs(i32 42, double 1.0, i64 12, i8 3, i16 1, i32 4, float 1.0, double 2.0) ret void } @@ -88,46 +104,123 @@ define void @test_varargs() { ; stack. The ADJCALLSTACK ops should reflect this, even if the difference is ; theoretical. 
declare void @stack_ext_needed([8 x i64], i8 signext %in) -; CHECK-LABEL: name: test_stack_ext_needed -; CHECK: ADJCALLSTACKDOWN 8 -; CHECK: BL @stack_ext_needed -; CHECK: ADJCALLSTACKUP 8 define void @test_stack_ext_needed() { + ; CHECK-LABEL: name: test_stack_ext_needed + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; CHECK: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 42 + ; CHECK: ADJCALLSTACKDOWN 8, 0, implicit-def $sp, implicit $sp + ; CHECK: $x0 = COPY [[DEF]](s64) + ; CHECK: $x1 = COPY [[DEF]](s64) + ; CHECK: $x2 = COPY [[DEF]](s64) + ; CHECK: $x3 = COPY [[DEF]](s64) + ; CHECK: $x4 = COPY [[DEF]](s64) + ; CHECK: $x5 = COPY [[DEF]](s64) + ; CHECK: $x6 = COPY [[DEF]](s64) + ; CHECK: $x7 = COPY [[DEF]](s64) + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $sp + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; CHECK: G_STORE [[C]](s8), [[PTR_ADD]](p0) :: (store 1 into stack) + ; CHECK: BL @stack_ext_needed, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7 + ; CHECK: ADJCALLSTACKUP 8, 0, implicit-def $sp, implicit $sp + ; CHECK: RET_ReallyLR call void @stack_ext_needed([8 x i64] undef, i8 signext 42) ret void } ; Check that we can lower incoming i128 types into constituent s64 gprs. -; CHECK-LABEL: name: callee_s128 -; CHECK: liveins: $x0, $x1, $x2, $x3, $x4 -; CHECK: [[A1_P1:%[0-9]+]]:_(s64) = COPY $x0 -; CHECK: [[A1_P2:%[0-9]+]]:_(s64) = COPY $x1 -; CHECK: [[A1_MERGE:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[A1_P1]](s64), [[A1_P2]](s64) -; CHECK: [[A2_P1:%[0-9]+]]:_(s64) = COPY $x2 -; CHECK: [[A2_P2:%[0-9]+]]:_(s64) = COPY $x3 -; CHECK: [[A2_MERGE:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[A2_P1]](s64), [[A2_P2]](s64) -; CHECK: G_STORE [[A2_MERGE]](s128) define void @callee_s128(i128 %a, i128 %b, i128 *%ptr) { + ; CHECK-LABEL: name: callee_s128 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $x0, $x1, $x2, $x3, $x4 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[COPY]](s64), [[COPY1]](s64) + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK: [[MV1:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[COPY2]](s64), [[COPY3]](s64) + ; CHECK: [[COPY4:%[0-9]+]]:_(p0) = COPY $x4 + ; CHECK: G_STORE [[MV1]](s128), [[COPY4]](p0) :: (store 16 into %ir.ptr) + ; CHECK: RET_ReallyLR store i128 %b, i128 *%ptr ret void } ; Check we can lower outgoing s128 arguments into s64 gprs. 
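The callee_s128/caller_s128 tests here check that an i128 travels through two 64-bit GPRs: G_UNMERGE_VALUES splits it into $x0/$x1 (and $x2/$x3) at the call site and G_MERGE_VALUES rebuilds it in the callee. The same lo/hi round trip in plain C++, assuming a compiler that provides __int128 (Clang or GCC on a 64-bit target):

    #include <cassert>
    #include <cstdint>

    int main() {
      unsigned __int128 V =
          (static_cast<unsigned __int128>(0x0123456789abcdefULL) << 64) |
          0xfedcba9876543210ULL;

      // Caller side: G_UNMERGE_VALUES -> two word-sized pieces.
      std::uint64_t Lo = static_cast<std::uint64_t>(V);
      std::uint64_t Hi = static_cast<std::uint64_t>(V >> 64);

      // Callee side: G_MERGE_VALUES puts them back together.
      unsigned __int128 Rebuilt =
          (static_cast<unsigned __int128>(Hi) << 64) | Lo;

      assert(Rebuilt == V);
      return 0;
    }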
 ; Check we can lower outgoing s128 arguments into s64 gprs.
-; CHECK-LABEL: name: caller_s128
-; CHECK: [[PTR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK: [[LARGE_VAL:%[0-9]+]]:_(s128) = G_LOAD [[PTR]](p0)
-; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
-; CHECK: [[A1_P1:%[0-9]+]]:_(s64), [[A1_P2:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LARGE_VAL]](s128)
-; CHECK: [[A2_P1:%[0-9]+]]:_(s64), [[A2_P2:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES %1(s128)
-; CHECK: $x0 = COPY [[A1_P1]](s64)
-; CHECK: $x1 = COPY [[A1_P2]](s64)
-; CHECK: $x2 = COPY [[A2_P1]](s64)
-; CHECK: $x3 = COPY [[A2_P2]](s64)
-; CHECK: $x4 = COPY [[PTR]](p0)
-; CHECK: BL @callee_s128, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit $x4
-; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
 define void @caller_s128(i128 *%ptr) {
+  ; CHECK-LABEL: name: caller_s128
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: liveins: $x0
+  ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+  ; CHECK: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[COPY]](p0) :: (load 16 from %ir.ptr)
+  ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+  ; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](s128)
+  ; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](s128)
+  ; CHECK: $x0 = COPY [[UV]](s64)
+  ; CHECK: $x1 = COPY [[UV1]](s64)
+  ; CHECK: $x2 = COPY [[UV2]](s64)
+  ; CHECK: $x3 = COPY [[UV3]](s64)
+  ; CHECK: $x4 = COPY [[COPY]](p0)
+  ; CHECK: BL @callee_s128, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit $x4
+  ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+  ; CHECK: RET_ReallyLR
   %v = load i128, i128 *%ptr
   call void @callee_s128(i128 %v, i128 %v, i128 *%ptr)
   ret void
 }
+
+
+declare i64 @i8i16callee(i64 %a1, i64 %a2, i64 %a3, i8 signext %a4, i16 signext %a5, i64 %a6, i64 %a7, i64 %a8, i8 signext %b1, i16 signext %b2, i8 signext %b3, i8 signext %b4) nounwind readnone noinline
+
+define i32 @i8i16caller() nounwind readnone {
+  ; CHECK-LABEL: name: i8i16caller
+  ; CHECK: bb.1.entry:
+  ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+  ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+  ; CHECK: [[C3:%[0-9]+]]:_(s8) = G_CONSTANT i8 3
+  ; CHECK: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
+  ; CHECK: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+  ; CHECK: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+  ; CHECK: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
+  ; CHECK: [[C8:%[0-9]+]]:_(s8) = G_CONSTANT i8 97
+  ; CHECK: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 98
+  ; CHECK: [[C10:%[0-9]+]]:_(s8) = G_CONSTANT i8 99
+  ; CHECK: [[C11:%[0-9]+]]:_(s8) = G_CONSTANT i8 100
+  ; CHECK: ADJCALLSTACKDOWN 32, 0, implicit-def $sp, implicit $sp
+  ; CHECK: $x0 = COPY [[C]](s64)
+  ; CHECK: $x1 = COPY [[C1]](s64)
+  ; CHECK: $x2 = COPY [[C2]](s64)
+  ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[C3]](s8)
+  ; CHECK: $w3 = COPY [[SEXT]](s32)
+  ; CHECK: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[C4]](s16)
+  ; CHECK: $w4 = COPY [[SEXT1]](s32)
+  ; CHECK: $x5 = COPY [[C5]](s64)
+  ; CHECK: $x6 = COPY [[C6]](s64)
+  ; CHECK: $x7 = COPY [[C7]](s64)
+  ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $sp
+  ; CHECK: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C12]](s64)
+  ; CHECK: G_STORE [[C8]](s8), [[PTR_ADD]](p0) :: (store 1 into stack)
+  ; CHECK: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C13]](s64)
+  ; CHECK: G_STORE [[C9]](s16), [[PTR_ADD1]](p0) :: (store 2 into stack + 8, align 1)
+  ; CHECK: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+  ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C14]](s64)
+  ; CHECK: G_STORE [[C10]](s8), [[PTR_ADD2]](p0) :: (store 1 into stack + 16)
+  ; CHECK: [[C15:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
+  ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C15]](s64)
+  ; CHECK: G_STORE [[C11]](s8), [[PTR_ADD3]](p0) :: (store 1 into stack + 24)
+  ; CHECK: BL @i8i16callee, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0, implicit $x1, implicit $x2, implicit $w3, implicit $w4, implicit $x5, implicit $x6, implicit $x7, implicit-def $x0
+  ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
+  ; CHECK: ADJCALLSTACKUP 32, 0, implicit-def $sp, implicit $sp
+  ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+  ; CHECK: $w0 = COPY [[TRUNC]](s32)
+  ; CHECK: RET_ReallyLR implicit $w0
+entry:
+  %call = tail call i64 @i8i16callee(i64 0, i64 1, i64 2, i8 signext 3, i16 signext 4, i64 5, i64 6, i64 7, i8 97, i16 98, i8 99, i8 100)
+  %conv = trunc i64 %call to i32
+  ret i32 %conv
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
new file mode 100644
index 00000000000000..d69f5ffac781ad
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+@lds = internal addrspace(3) global [576 x double] undef, align 16
+
+; Stores to the same address appear in multiple places in the same
+; block. When sorted by offset, the merges would fail. We should form
+; two groupings of ds_write2_b64 on either side of the fence.
+define amdgpu_kernel void @same_address_fence_merge_write2() #0 {
+; GCN-LABEL: same_address_fence_merge_write2:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GCN-NEXT:    s_mov_b32 s1, 0x40100000
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_add_u32_e32 v3, 0x840, v2
+; GCN-NEXT:    v_add_u32_e32 v4, 0xc60, v2
+; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
+; GCN-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    s_mov_b32 s1, 0x3ff00000
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_barrier
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
+; GCN-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    s_endpgm
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
+  %tmp1 = getelementptr inbounds [576 x double], [576 x double] addrspace(3)* @lds, i32 0, i32 %tmp
+  store double 4.000000e+00, double addrspace(3)* %tmp1, align 8
+  %tmp2 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 66
+  store double 4.000000e+00, double addrspace(3)* %tmp2, align 8
+  %tmp3 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 132
+  store double 4.000000e+00, double addrspace(3)* %tmp3, align 8
+  %tmp4 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 198
+  store double 4.000000e+00, double addrspace(3)* %tmp4, align 8
+  %tmp5 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 264
+  store double 4.000000e+00, double addrspace(3)* %tmp5, align 8
+  %tmp6 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 330
+  store double 4.000000e+00, double addrspace(3)* %tmp6, align 8
+  %tmp7 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 396
+  store double 4.000000e+00, double addrspace(3)* %tmp7, align 8
+  %tmp8 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 462
+  store double 4.000000e+00, double addrspace(3)* %tmp8, align 8
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp6, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp7, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp8, align 8
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare void @llvm.amdgcn.s.barrier() #1
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { convergent nounwind }
+
+!0 = !{i32 0, i32 1024}
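The comment at the top of that new test carries the key point: LDS stores may be paired into ds_write2_b64 only within one side of the release/acquire fence. A reduced sketch of the same pattern follows; it is not part of the patch, and the global and function names are hypothetical.

; Sketch only: a cut-down form of the fence-splitting pattern above;
; @lds_sketch and @fence_split_sketch are made-up names.
@lds_sketch = internal addrspace(3) global [2 x double] undef, align 16

define amdgpu_kernel void @fence_split_sketch() {
  %p0 = getelementptr inbounds [2 x double], [2 x double] addrspace(3)* @lds_sketch, i32 0, i32 0
  %p1 = getelementptr inbounds [2 x double], [2 x double] addrspace(3)* @lds_sketch, i32 0, i32 1
  ; These two stores are candidates for one ds_write2_b64 pairing ...
  store double 4.000000e+00, double addrspace(3)* %p0, align 8
  store double 4.000000e+00, double addrspace(3)* %p1, align 8
  ; ... but nothing should be merged or sorted across the fence/barrier/fence.
  fence syncscope("workgroup") release
  call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  ; These two stores form a second, independent pairing candidate.
  store double 1.000000e+00, double addrspace(3)* %p0, align 8
  store double 1.000000e+00, double addrspace(3)* %p1, align 8
  ret void
}

declare void @llvm.amdgcn.s.barrier()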
diff --git a/llvm/test/CodeGen/Mips/xray-mips-attribute-instrumentation.ll b/llvm/test/CodeGen/Mips/xray-mips-attribute-instrumentation.ll
index a7c859a1815fe1..b78909d4adb343
--- a/llvm/test/CodeGen/Mips/xray-mips-attribute-instrumentation.ll
+++ b/llvm/test/CodeGen/Mips/xray-mips-attribute-instrumentation.ll
@@ -7,8 +7,8 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always"
 ; CHECK: .p2align 2
 ; CHECK-MIPS64-LABEL: .Lxray_sled_0:
 ; CHECK-MIPS32-LABEL: $xray_sled_0:
-; CHECK-MIPS64: b .Ltmp0
-; CHECK-MIPS32: b $tmp0
+; CHECK-MIPS64: b .Ltmp1
+; CHECK-MIPS32: b $tmp1
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: nop
@@ -24,15 +24,15 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always"
 ; CHECK-MIPS64: nop
 ; CHECK-MIPS64: nop
 ; CHECK-MIPS64: nop
-; CHECK-MIPS64-LABEL: .Ltmp0:
-; CHECK-MIPS32-LABEL: $tmp0:
+; CHECK-MIPS64-LABEL: .Ltmp1:
+; CHECK-MIPS32-LABEL: $tmp1:
 ; CHECK-MIPS32: addiu $25, $25, 52
   ret i32 0
 ; CHECK: .p2align 2
 ; CHECK-MIPS64-LABEL: .Lxray_sled_1:
+; CHECK-MIPS64-NEXT: b .Ltmp2
 ; CHECK-MIPS32-LABEL: $xray_sled_1:
-; CHECK-MIPS64: b .Ltmp1
-; CHECK-MIPS32: b $tmp1
+; CHECK-MIPS32-NEXT: b $tmp2
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: nop
@@ -48,8 +48,8 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always"
 ; CHECK-MIPS64: nop
 ; CHECK-MIPS64: nop
 ; CHECK-MIPS64: nop
-; CHECK-MIPS64-LABEL: .Ltmp1:
-; CHECK-MIPS32-LABEL: $tmp1:
+; CHECK-MIPS64-LABEL: .Ltmp2:
+; CHECK-MIPS32-LABEL: $tmp2:
 ; CHECK-MIPS32: addiu $25, $25, 52
 }
 ; CHECK: .section xray_instr_map,{{.*}}
@@ -63,9 +63,9 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always"
 define i32 @bar(i32 %i) nounwind noinline uwtable "function-instrument"="xray-always" {
 ; CHECK: .p2align 2
 ; CHECK-MIPS64-LABEL: .Lxray_sled_2:
+; CHECK-MIPS64-NEXT: b .Ltmp4
 ; CHECK-MIPS32-LABEL: $xray_sled_2:
-; CHECK-MIPS64: b .Ltmp2
-; CHECK-MIPS32: b $tmp2
+; CHECK-MIPS32-NEXT: b $tmp4
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: nop
@@ -81,8 +81,8 @@ define i32 @bar(i32 %i) nounwind noinline uwtable "function-instrument"="xray-al
 ; CHECK-MIPS64: nop
 ; CHECK-MIPS64: nop
 ; CHECK-MIPS64: nop
-; CHECK-MIPS64-LABEL: .Ltmp2:
-; CHECK-MIPS32-LABEL: $tmp2:
+; CHECK-MIPS64-LABEL: .Ltmp4:
+; CHECK-MIPS32-LABEL: $tmp4:
 ; CHECK-MIPS32: addiu $25, $25, 52
 Test:
   %cond = icmp eq i32 %i, 0
@@ -91,9 +91,9 @@ IsEqual:
   ret i32 0
 ; CHECK: .p2align 2
 ; CHECK-MIPS64-LABEL: .Lxray_sled_3:
+; CHECK-MIPS64-NEXT: b .Ltmp5
 ; CHECK-MIPS32-LABEL: $xray_sled_3:
-; CHECK-MIPS64: b .Ltmp3
-; CHECK-MIPS32: b $tmp3
+; CHECK-MIPS32-NEXT: b $tmp5
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: nop
@@ -109,16 +109,16 @@ IsEqual:
 ; CHECK-MIPS64: nop
 ; CHECK-MIPS64: nop
 ; CHECK-MIPS64: nop
-; CHECK-MIPS64-LABEL: .Ltmp3:
-; CHECK-MIPS32-LABEL: $tmp3:
+; CHECK-MIPS64-LABEL: .Ltmp5:
+; CHECK-MIPS32-LABEL: $tmp5:
 ; CHECK-MIPS32: addiu $25, $25, 52
 NotEqual:
   ret i32 1
 ; CHECK: .p2align 2
 ; CHECK-MIPS64-LABEL: .Lxray_sled_4:
+; CHECK-MIPS64-NEXT: b .Ltmp6
 ; CHECK-MIPS32-LABEL: $xray_sled_4:
-; CHECK-MIPS64: b .Ltmp4
-; CHECK-MIPS32: b $tmp4
+; CHECK-MIPS32-NEXT: b $tmp6
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: nop
@@ -134,8 +134,8 @@ NotEqual:
 ; CHECK-MIPS64: nop
 ; CHECK-MIPS64: nop
 ; CHECK-MIPS64: nop
-; CHECK-MIPS64-LABEL: .Ltmp4:
-; CHECK-MIPS32-LABEL: $tmp4:
+; CHECK-MIPS64-LABEL: .Ltmp6:
+; CHECK-MIPS32-LABEL: $tmp6:
 ; CHECK-MIPS32: addiu $25, $25, 52
 }
 ; CHECK: .section xray_instr_map,{{.*}}
diff --git a/llvm/test/CodeGen/PowerPC/xray-attribute-instrumentation.ll b/llvm/test/CodeGen/PowerPC/xray-attribute-instrumentation.ll
index 63c34be51ec374..f73679001158d6
--- a/llvm/test/CodeGen/PowerPC/xray-attribute-instrumentation.ll
+++ b/llvm/test/CodeGen/PowerPC/xray-attribute-instrumentation.ll
@@ -3,6 +3,8 @@
 ; RUN:     -relocation-model=pic < %s | FileCheck %s
 define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always" {
+; CHECK-LABEL: foo:
+; CHECK-NEXT: .Lfunc_begin0:
 ; CHECK-LABEL: .Ltmp0:
 ; CHECK: b .Ltmp1
 ; CHECK-NEXT: nop
@@ -26,14 +28,14 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always"
 ; CHECK: .Lxray_sleds_start0:
 ; CHECK-NEXT: .Ltmp3:
 ; CHECK-NEXT: .quad .Ltmp0-.Ltmp3
-; CHECK-NEXT: .quad foo
+; CHECK-NEXT: .quad .Lfunc_begin0-(.Ltmp3+8)
 ; CHECK-NEXT: .byte 0x00
 ; CHECK-NEXT: .byte 0x01
 ; CHECK-NEXT: .byte 0x02
 ; CHECK-NEXT: .space 13
 ; CHECK-NEXT: .Ltmp4:
 ; CHECK-NEXT: .quad .Ltmp2-.Ltmp4
-; CHECK-NEXT: .quad foo
+; CHECK-NEXT: .quad .Lfunc_begin0-(.Ltmp4+8)
 ; CHECK-NEXT: .byte 0x01
 ; CHECK-NEXT: .byte 0x01
 ; CHECK-NEXT: .byte 0x02
diff --git a/llvm/test/CodeGen/X86/xray-log-args.ll b/llvm/test/CodeGen/X86/xray-log-args.ll
index 7ec1b34094d341..812e04a483fb76
--- a/llvm/test/CodeGen/X86/xray-log-args.ll
+++ b/llvm/test/CodeGen/X86/xray-log-args.ll
@@ -6,17 +6,20 @@
 define i32 @callee(i32 %arg) nounwind noinline uwtable "function-instrument"="xray-always" "xray-log-args"="1" {
   ret i32 %arg
 }
+; CHECK-LABEL: callee:
+; CHECK-NEXT: Lfunc_begin0:
+
 ; CHECK-LABEL: Lxray_sleds_start0:
 ; CHECK-NEXT: Ltmp0:
 ; CHECK-NEXT: .quad {{\.?}}Lxray_sled_0-{{\.?}}Ltmp0
-; CHECK-NEXT: .quad {{_?}}callee
+; CHECK-NEXT: .quad {{\.?}}Lfunc_begin0-({{\.?}}Ltmp0+8)
 ; CHECK-NEXT: .byte 0x03
 ; CHECK-NEXT: .byte 0x01
 ; CHECK-NEXT: .byte 0x02
 ; CHECK: .{{(zero|space)}} 13
 ; CHECK: Ltmp1:
 ; CHECK-NEXT: .quad {{\.?}}Lxray_sled_1-{{\.?}}Ltmp1
-; CHECK-NEXT: .quad {{_?}}callee
+; CHECK-NEXT: .quad {{\.?}}Lfunc_begin0-({{\.?}}Ltmp1+8)
 ; CHECK-NEXT: .byte 0x01
 ; CHECK-NEXT: .byte 0x01
 ; CHECK-NEXT: .byte 0x02
@@ -29,14 +32,14 @@ define i32 @caller(i32 %arg) nounwind noinline uwtable "function-instrument"="xr
 ; CHECK-LABEL: Lxray_sleds_start1:
 ; CHECK-NEXT: Ltmp3:
 ; CHECK-NEXT: .quad {{\.?}}Lxray_sled_2-{{\.?}}Ltmp3
-; CHECK-NEXT: .quad {{_?}}caller
+; CHECK-NEXT: .quad {{\.?}}Lfunc_begin1-({{\.?}}Ltmp3+8)
 ; CHECK-NEXT: .byte 0x03
 ; CHECK-NEXT: .byte 0x01
 ; CHECK-NEXT: .byte 0x02
 ; CHECK: .{{(zero|space)}} 13
 ; CHECK: Ltmp4:
 ; CHECK-NEXT: .quad {{\.?}}Lxray_sled_3-{{\.?}}Ltmp4
-; CHECK-NEXT: .quad {{_?}}caller
+; CHECK-NEXT: .quad {{\.?}}Lfunc_begin1-({{\.?}}Ltmp4+8)
 ; CHECK-NEXT: .byte 0x02
 ; CHECK-NEXT: .byte 0x01
 ; CHECK-NEXT: .byte 0x02
diff --git a/llvm/test/MC/RISCV/debug-valid.s b/llvm/test/MC/RISCV/debug-valid.s
new file mode 100644
index 00000000000000..89b8f001b2cefb
--- /dev/null
+++ b/llvm/test/MC/RISCV/debug-valid.s
@@ -0,0 +1,14 @@
+# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
+# RUN:     | llvm-objdump -M no-aliases -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
+# RUN:     | llvm-objdump -M no-aliases -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+
+# CHECK-INST: dret
+# CHECK: encoding: [0x73,0x00,0x20,0x7b]
+dret
diff --git a/llvm/tools/llvm-xray/xray-extract.cpp b/llvm/tools/llvm-xray/xray-extract.cpp
index 6ea0f59e5eb956..8304d2d27afa34
--- a/llvm/tools/llvm-xray/xray-extract.cpp
+++ b/llvm/tools/llvm-xray/xray-extract.cpp
@@ -63,9 +63,9 @@ void exportAsYAML(const InstrumentationMap &Map, raw_ostream &OS,
     auto FuncId = Map.getFunctionId(Sled.Function);
     if (!FuncId)
       return;
-    YAMLSleds.push_back({*FuncId, Sled.Address, Sled.Function, Sled.Kind,
-                         Sled.AlwaysInstrument,
-                         ExtractSymbolize ? FH.SymbolOrNumber(*FuncId) : ""});
+    YAMLSleds.push_back(
+        {*FuncId, Sled.Address, Sled.Function, Sled.Kind, Sled.AlwaysInstrument,
+         ExtractSymbolize ? FH.SymbolOrNumber(*FuncId) : "", Sled.Version});
   }
   Output Out(OS, nullptr, 0);
   Out << YAMLSleds;
diff --git a/llvm/unittests/ADT/BitVectorTest.cpp b/llvm/unittests/ADT/BitVectorTest.cpp
index efefd2b90be8c9..0f15a478e4523c
--- a/llvm/unittests/ADT/BitVectorTest.cpp
+++ b/llvm/unittests/ADT/BitVectorTest.cpp
@@ -297,6 +297,18 @@ TYPED_TEST(BitVectorTest, SimpleFindOpsSingleWord) {
   EXPECT_EQ(-1, A.find_last_unset());
 
   A.resize(20);
+  ASSERT_TRUE(SmallBitVectorIsSmallMode(A));
+  EXPECT_EQ(-1, A.find_first());
+  EXPECT_EQ(-1, A.find_last());
+  EXPECT_EQ(-1, A.find_next(5));
+  EXPECT_EQ(-1, A.find_next(19));
+  EXPECT_EQ(-1, A.find_prev(5));
+  EXPECT_EQ(-1, A.find_prev(20));
+  EXPECT_EQ(0, A.find_first_unset());
+  EXPECT_EQ(19, A.find_last_unset());
+  EXPECT_EQ(6, A.find_next_unset(5));
+  EXPECT_EQ(-1, A.find_next_unset(19));
+
   A.set(3);
   A.set(4);
   A.set(16);
@@ -319,6 +331,19 @@ TYPED_TEST(BitVectorTest, SimpleFindOpsSingleWord) {
   EXPECT_EQ(5, A.find_next_unset(4));
   EXPECT_EQ(13, A.find_next_unset(12));
   EXPECT_EQ(17, A.find_next_unset(15));
+
+  A.set();
+  ASSERT_TRUE(SmallBitVectorIsSmallMode(A));
+  EXPECT_EQ(0, A.find_first());
+  EXPECT_EQ(19, A.find_last());
+  EXPECT_EQ(6, A.find_next(5));
+  EXPECT_EQ(-1, A.find_next(19));
+  EXPECT_EQ(4, A.find_prev(5));
+  EXPECT_EQ(19, A.find_prev(20));
+  EXPECT_EQ(-1, A.find_first_unset());
+  EXPECT_EQ(-1, A.find_last_unset());
+  EXPECT_EQ(-1, A.find_next_unset(5));
+  EXPECT_EQ(-1, A.find_next_unset(19));
 }
 
 TEST(BitVectorTest, FindInRangeMultiWord) {
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 765dd54fe5ca64..b613cd2eccb9f6
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -157,6 +157,7 @@ void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
                                  bool UpdateRefCount, bool HasCloseModifier) {
   void *rc = NULL;
   IsHostPtr = false;
+  IsNew = false;
   DataMapMtx.lock();
   LookupResult lr = lookupMapping(HstPtrBegin, Size);