diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge index f32eb7213b9402..7b1b4b8dac8a20 100755 --- a/.ci/generate-buildkite-pipeline-premerge +++ b/.ci/generate-buildkite-pipeline-premerge @@ -22,19 +22,20 @@ set -o pipefail # Environment variables script works with: -# Fetch origin/main to have an up to date merge base for main...HEAD diff. -git fetch origin main:main +# Set by buildkite +: ${BUILDKITE_PULL_REQUEST_BASE_BRANCH:=} +: ${BUILDKITE_COMMIT:=} +: ${BUILDKITE_BRANCH:=} +# Fetch origin to have an up to date merge base for the diff. +git fetch origin # List of files affected by this commit -: ${MODIFIED_FILES:=$(git diff --name-only main...HEAD)} +: ${MODIFIED_FILES:=$(git diff --name-only origin/${BUILDKITE_PULL_REQUEST_BASE_BRANCH}...HEAD)} # Filter rules for generic windows tests : ${WINDOWS_AGENTS:='{"queue": "windows"}'} # Filter rules for generic linux tests : ${LINUX_AGENTS:='{"queue": "linux"}'} # Service agents, for interacting with Phabricator. 
: ${SERVICE_AGENTS:='{"queue": "service"}'} -# Set by buildkite -: ${BUILDKITE_COMMIT:=} -: ${BUILDKITE_BRANCH:=} reviewID="$(git log --format=%B -n 1 | sed -nE 's/^Review-ID:[[:space:]]*(.+)$/\1/p')" if [[ "${reviewID}" != "" ]]; then diff --git a/.github/workflows/libclc-tests.yml b/.github/workflows/libclc-tests.yml index 29d050db2f12c0..23192f776a985e 100644 --- a/.github/workflows/libclc-tests.yml +++ b/.github/workflows/libclc-tests.yml @@ -36,5 +36,4 @@ jobs: name: Test libclc uses: ./.github/workflows/llvm-project-tests.yml with: - build_target: '' projects: clang;libclc diff --git a/.github/workflows/lldb-tests.yml b/.github/workflows/lldb-tests.yml index ef5d7c7d581b7d..6bb9721956258f 100644 --- a/.github/workflows/lldb-tests.yml +++ b/.github/workflows/lldb-tests.yml @@ -36,5 +36,4 @@ jobs: name: Build lldb uses: ./.github/workflows/llvm-project-tests.yml with: - build_target: '' projects: clang;lldb diff --git a/.github/workflows/llvm-project-tests.yml b/.github/workflows/llvm-project-tests.yml index a1404e1f1efa95..a52dd2db8035dd 100644 --- a/.github/workflows/llvm-project-tests.yml +++ b/.github/workflows/llvm-project-tests.yml @@ -14,7 +14,7 @@ on: required: false os_list: required: false - default: '["ubuntu-latest", "windows-2019", "macOS-11"]' + default: '["ubuntu-latest", "windows-2019", "macOS-13"]' python_version: required: false type: string @@ -22,8 +22,9 @@ on: workflow_call: inputs: build_target: - required: true + required: false type: string + default: "all" projects: required: true @@ -38,9 +39,7 @@ on: type: string # Use windows-2019 due to: # https://developercommunity.visualstudio.com/t/Prev-Issue---with-__assume-isnan-/1597317 - # We're using a specific version of macOS due to: - # https://github.com/actions/virtual-environments/issues/5900 - default: '["ubuntu-latest", "windows-2019", "macOS-11"]' + default: '["ubuntu-latest", "windows-2019", "macOS-13"]' python_version: required: false @@ -59,6 +58,10 @@ jobs: lit-tests: name: Lit 
Tests runs-on: ${{ matrix.os }} + container: + image: ${{(startsWith(matrix.os, 'ubuntu') && 'ghcr.io/llvm/ci-ubuntu-22.04:latest') || null}} + volumes: + - /mnt/:/mnt/ strategy: fail-fast: false matrix: @@ -78,6 +81,7 @@ jobs: with: python-version: ${{ inputs.python_version }} - name: Install Ninja + if: runner.os != 'Linux' uses: llvm/actions/install-ninja@main # actions/checkout deletes any existing files in the new git directory, # so this needs to either run before ccache-action or it has to use @@ -95,24 +99,51 @@ jobs: # run creates a new cache entry so we want to ensure that we have # enough cache space for all the tests to run at once and still # fit under the 10 GB limit. - max-size: 500M + # Default to 2G to workaround: https://github.com/hendrikmuhs/ccache-action/issues/174 + max-size: 2G key: ${{ matrix.os }} variant: sccache - name: Build and Test - uses: llvm/actions/build-test-llvm-project@main env: # Workaround for https://github.com/actions/virtual-environments/issues/5900. 
# This should be a no-op for non-mac OSes PKG_CONFIG_PATH: /usr/local/Homebrew/Library/Homebrew/os/mac/pkgconfig//12 - with: - cmake_args: '-GNinja -DLLVM_ENABLE_PROJECTS="${{ inputs.projects }}" -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLDB_INCLUDE_TESTS=OFF -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache ${{ inputs.extra_cmake_args }}' - build_target: '${{ inputs.build_target }}' + shell: bash + id: build-llvm + run: | + if [ "${{ runner.os }}" == "Linux" ]; then + builddir="/mnt/build/" + mkdir -p $builddir + extra_cmake_args="-DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang" + else + builddir="$(pwd)"/build + fi + if [ "${{ runner.os }}" == "macOS" ]; then + # Workaround test failure on some lld tests on MacOS + # https://github.com/llvm/llvm-project/issues/81967 + extra_cmake_args="-DLLVM_DISABLE_ASSEMBLY_FILES=ON" + fi + echo "llvm-builddir=$builddir" >> "$GITHUB_OUTPUT" + cmake -G Ninja \ + -B "$builddir" \ + -S llvm \ + -DLLVM_ENABLE_PROJECTS="${{ inputs.projects }}" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLDB_INCLUDE_TESTS=OFF \ + -DCMAKE_C_COMPILER_LAUNCHER=sccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \ + $extra_cmake_args \ + ${{ inputs.extra_cmake_args }} + ninja -C "$builddir" '${{ inputs.build_target }}' - name: Build and Test libclc if: "!startsWith(matrix.os, 'windows') && contains(inputs.projects, 'libclc')" + env: + LLVM_BUILDDIR: ${{ steps.build-llvm.outputs.llvm-builddir }} run: | # Make sure all of LLVM libraries that llvm-config needs are built. 
- ninja -C build - cmake -G Ninja -S libclc -B libclc-build -DLLVM_DIR="$(pwd)"/build/lib/cmake/llvm -DLIBCLC_TARGETS_TO_BUILD="amdgcn--;amdgcn--amdhsa;r600--;nvptx--;nvptx64--;nvptx--nvidiacl;nvptx64--nvidiacl" + ninja -C "$LLVM_BUILDDIR" + cmake -G Ninja -S libclc -B libclc-build -DLLVM_DIR="$LLVM_BUILDDIR"/lib/cmake/llvm -DLIBCLC_TARGETS_TO_BUILD="amdgcn--;amdgcn--amdhsa;r600--;nvptx--;nvptx64--;nvptx--nvidiacl;nvptx64--nvidiacl" ninja -C libclc-build ninja -C libclc-build test diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml index cc9855ce182b2b..64d60bc3da45e1 100644 --- a/.github/workflows/llvm-tests.yml +++ b/.github/workflows/llvm-tests.yml @@ -27,31 +27,13 @@ concurrency: cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: - check_all: + check-all: if: github.repository_owner == 'llvm' - name: Test llvm,clang,libclc + name: Build and Test uses: ./.github/workflows/llvm-project-tests.yml with: build_target: check-all - projects: clang;libclc - - # These need to be separate from the check_all job, becuase there is not enough disk - # space to build all these projects on Windows. 
- build_lldb: - if: github.repository_owner == 'llvm' - name: Build lldb - uses: ./.github/workflows/llvm-project-tests.yml - with: - build_target: '' - projects: clang;lldb - - check_lld: - if: github.repository_owner == 'llvm' - name: Test lld - uses: ./.github/workflows/llvm-project-tests.yml - with: - build_target: check-lld - projects: lld + projects: clang;lld;libclc;lldb abi-dump-setup: if: github.repository_owner == 'llvm' @@ -60,6 +42,7 @@ jobs: BASELINE_REF: ${{ steps.vars.outputs.BASELINE_REF }} ABI_HEADERS: ${{ steps.vars.outputs.ABI_HEADERS }} BASELINE_VERSION_MAJOR: ${{ steps.vars.outputs.BASELINE_VERSION_MAJOR }} + BASELINE_VERSION_MINOR: ${{ steps.vars.outputs.BASELINE_VERSION_MINOR }} LLVM_VERSION_MAJOR: ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} LLVM_VERSION_MINOR: ${{ steps.version.outputs.LLVM_VERSION_MINOR }} LLVM_VERSION_PATCH: ${{ steps.version.outputs.LLVM_VERSION_PATCH }} @@ -76,7 +59,14 @@ jobs: - name: Setup Variables id: vars run: | - if [ ${{ steps.version.outputs.LLVM_VERSION_MINOR }} -ne 0 ] || [ ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then + # C++ ABI: + # 18.1.0 we aren't doing ABI checks. + # 18.1.1 We want to check 18.1.0. 
+ # C ABI: + # 18.1.0 We want to check 17.0.x + # 18.1.1 We want to check 18.1.0 + echo "BASELINE_VERSION_MINOR=1" >> "$GITHUB_OUTPUT" + if [ ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then { echo "BASELINE_VERSION_MAJOR=$(( ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} - 1))" echo "ABI_HEADERS=llvm-c" @@ -100,7 +90,7 @@ jobs: include: - name: build-baseline llvm_version_major: ${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }} - ref: llvmorg-${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}.0.0 + ref: llvmorg-${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}.${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MINOR }}.0 repo: llvm/llvm-project - name: build-latest llvm_version_major: ${{ needs.abi-dump-setup.outputs.LLVM_VERSION_MAJOR }} @@ -143,7 +133,7 @@ jobs: else touch llvm.symbols fi - abi-dumper "$EXTRA_ARGS" -lver ${{ matrix.ref }} -skip-cxx -public-headers ./install/include/${{ needs.abi-dump-setup.outputs.ABI_HEADERS }} -o ${{ matrix.ref }}.abi ./install/lib/libLLVM.so + abi-dumper $EXTRA_ARGS -lver ${{ matrix.ref }} -skip-cxx -public-headers ./install/include/${{ needs.abi-dump-setup.outputs.ABI_HEADERS }} -o ${{ matrix.ref }}.abi ./install/lib/libLLVM.so # Remove symbol versioning from dumps, so we can compare across major versions. sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' ${{ matrix.ref }}.abi - name: Upload ABI file @@ -193,7 +183,7 @@ jobs: # FIXME: Reading of gzip'd abi files on the GitHub runners stop # working some time in March of 2021, likely due to a change in the # runner's environment. 
- abi-compliance-checker "$EXTRA_ARGS" -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c" + abi-compliance-checker $EXTRA_ARGS -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c" - name: Upload ABI Comparison if: always() uses: actions/upload-artifact@v3 diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml index 5223089ee8a93d..3d0c23917bd403 100644 --- a/.github/workflows/pr-code-format.yml +++ b/.github/workflows/pr-code-format.yml @@ -1,5 +1,9 @@ name: "Check code formatting" -on: pull_request_target +on: + pull_request_target: + branches: + - main + permissions: pull-requests: write diff --git a/.github/workflows/release-lit.yml b/.github/workflows/release-lit.yml index 36b0b6edd518fc..0316ba406041d6 100644 --- a/.github/workflows/release-lit.yml +++ b/.github/workflows/release-lit.yml @@ -58,7 +58,7 @@ jobs: cd llvm/utils/lit # Remove 'dev' suffix from lit version. 
sed -i 's/ + "dev"//g' lit/__init__.py - python3 setup.py sdist + python3 setup.py sdist bdist_wheel - name: Upload lit to test.pypi.org uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/release-tasks.yml b/.github/workflows/release-tasks.yml index f2a831ad3577ad..53da8662b0203a 100644 --- a/.github/workflows/release-tasks.yml +++ b/.github/workflows/release-tasks.yml @@ -28,6 +28,7 @@ jobs: name: Create a New Release runs-on: ubuntu-latest needs: validate-tag + steps: - name: Install Dependencies run: | @@ -40,8 +41,9 @@ jobs: - name: Create Release env: GITHUB_TOKEN: ${{ github.token }} + USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }} run: | - ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --release ${{ needs.validate-tag.outputs.release-version }} --user ${{ github.actor }} create + ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --release ${{ needs.validate-tag.outputs.release-version }} --user ${{ github.actor }} --user-token "$USER_TOKEN" create release-documentation: name: Build and Upload Release Documentation needs: diff --git a/.github/workflows/version-check.py b/.github/workflows/version-check.py index 7f805f304e3d76..f75fd50300881b 100755 --- a/.github/workflows/version-check.py +++ b/.github/workflows/version-check.py @@ -16,7 +16,7 @@ def get_version_from_tag(tag): m = re.match("llvmorg-([0-9]+)-init", tag) if m: - return (m.group(1), "0", "0") + return (m.group(1), "1", "0") raise Exception(f"error: Tag is not valid: {tag}") diff --git a/clang-tools-extra/clangd/HeuristicResolver.cpp b/clang-tools-extra/clangd/HeuristicResolver.cpp index 3c147b6b582bf0..26d54200eeffd2 100644 --- a/clang-tools-extra/clangd/HeuristicResolver.cpp +++ b/clang-tools-extra/clangd/HeuristicResolver.cpp @@ -16,6 +16,80 @@ namespace clang { namespace clangd { +namespace { + +// Helper class for implementing HeuristicResolver. 
+// Unlike HeuristicResolver which is a long-lived class, +// a new instance of this class is created for every external +// call into a HeuristicResolver operation. That allows this +// class to store state that's local to such a top-level call, +// particularly "recursion protection sets" that keep track of +// nodes that have already been seen to avoid infinite recursion. +class HeuristicResolverImpl { +public: + HeuristicResolverImpl(ASTContext &Ctx) : Ctx(Ctx) {} + + // These functions match the public interface of HeuristicResolver + // (but aren't const since they may modify the recursion protection sets). + std::vector + resolveMemberExpr(const CXXDependentScopeMemberExpr *ME); + std::vector + resolveDeclRefExpr(const DependentScopeDeclRefExpr *RE); + std::vector resolveTypeOfCallExpr(const CallExpr *CE); + std::vector resolveCalleeOfCallExpr(const CallExpr *CE); + std::vector + resolveUsingValueDecl(const UnresolvedUsingValueDecl *UUVD); + std::vector + resolveDependentNameType(const DependentNameType *DNT); + std::vector resolveTemplateSpecializationType( + const DependentTemplateSpecializationType *DTST); + const Type *resolveNestedNameSpecifierToType(const NestedNameSpecifier *NNS); + const Type *getPointeeType(const Type *T); + +private: + ASTContext &Ctx; + + // Recursion protection sets + llvm::SmallSet SeenDependentNameTypes; + + // Given a tag-decl type and a member name, heuristically resolve the + // name to one or more declarations. + // The current heuristic is simply to look up the name in the primary + // template. This is a heuristic because the template could potentially + // have specializations that declare different members. + // Multiple declarations could be returned if the name is overloaded + // (e.g. an overloaded method in the primary template). + // This heuristic will give the desired answer in many cases, e.g. + // for a call to vector::size(). 
+ std::vector + resolveDependentMember(const Type *T, DeclarationName Name, + llvm::function_ref Filter); + + // Try to heuristically resolve the type of a possibly-dependent expression + // `E`. + const Type *resolveExprToType(const Expr *E); + std::vector resolveExprToDecls(const Expr *E); + + // Helper function for HeuristicResolver::resolveDependentMember() + // which takes a possibly-dependent type `T` and heuristically + // resolves it to a CXXRecordDecl in which we can try name lookup. + CXXRecordDecl *resolveTypeToRecordDecl(const Type *T); + + // This is a reimplementation of CXXRecordDecl::lookupDependentName() + // so that the implementation can call into other HeuristicResolver helpers. + // FIXME: Once HeuristicResolver is upstreamed to the clang libraries + // (https://github.com/clangd/clangd/discussions/1662), + // CXXRecordDecl::lookupDependentName() can be removed, and its call sites + // can be modified to benefit from the more comprehensive heuristics offered + // by HeuristicResolver instead. + std::vector + lookupDependentName(CXXRecordDecl *RD, DeclarationName Name, + llvm::function_ref Filter); + bool findOrdinaryMemberInDependentClasses(const CXXBaseSpecifier *Specifier, + CXXBasePath &Path, + DeclarationName Name); +}; + // Convenience lambdas for use as the 'Filter' parameter of // HeuristicResolver::resolveDependentMember(). const auto NoFilter = [](const NamedDecl *D) { return true; }; @@ -31,8 +105,6 @@ const auto TemplateFilter = [](const NamedDecl *D) { return isa(D); }; -namespace { - const Type *resolveDeclsToType(const std::vector &Decls, ASTContext &Ctx) { if (Decls.size() != 1) // Names an overload set -- just bail. @@ -46,12 +118,10 @@ const Type *resolveDeclsToType(const std::vector &Decls, return nullptr; } -} // namespace - // Helper function for HeuristicResolver::resolveDependentMember() // which takes a possibly-dependent type `T` and heuristically // resolves it to a CXXRecordDecl in which we can try name lookup. 
-CXXRecordDecl *HeuristicResolver::resolveTypeToRecordDecl(const Type *T) const { +CXXRecordDecl *HeuristicResolverImpl::resolveTypeToRecordDecl(const Type *T) { assert(T); // Unwrap type sugar such as type aliases. @@ -84,7 +154,7 @@ CXXRecordDecl *HeuristicResolver::resolveTypeToRecordDecl(const Type *T) const { return TD->getTemplatedDecl(); } -const Type *HeuristicResolver::getPointeeType(const Type *T) const { +const Type *HeuristicResolverImpl::getPointeeType(const Type *T) { if (!T) return nullptr; @@ -117,8 +187,8 @@ const Type *HeuristicResolver::getPointeeType(const Type *T) const { return FirstArg.getAsType().getTypePtrOrNull(); } -std::vector HeuristicResolver::resolveMemberExpr( - const CXXDependentScopeMemberExpr *ME) const { +std::vector HeuristicResolverImpl::resolveMemberExpr( + const CXXDependentScopeMemberExpr *ME) { // If the expression has a qualifier, try resolving the member inside the // qualifier's type. // Note that we cannot use a NonStaticFilter in either case, for a couple @@ -164,14 +234,14 @@ std::vector HeuristicResolver::resolveMemberExpr( return resolveDependentMember(BaseType, ME->getMember(), NoFilter); } -std::vector HeuristicResolver::resolveDeclRefExpr( - const DependentScopeDeclRefExpr *RE) const { +std::vector +HeuristicResolverImpl::resolveDeclRefExpr(const DependentScopeDeclRefExpr *RE) { return resolveDependentMember(RE->getQualifier()->getAsType(), RE->getDeclName(), StaticFilter); } std::vector -HeuristicResolver::resolveTypeOfCallExpr(const CallExpr *CE) const { +HeuristicResolverImpl::resolveTypeOfCallExpr(const CallExpr *CE) { const auto *CalleeType = resolveExprToType(CE->getCallee()); if (!CalleeType) return {}; @@ -187,7 +257,7 @@ HeuristicResolver::resolveTypeOfCallExpr(const CallExpr *CE) const { } std::vector -HeuristicResolver::resolveCalleeOfCallExpr(const CallExpr *CE) const { +HeuristicResolverImpl::resolveCalleeOfCallExpr(const CallExpr *CE) { if (const auto *ND = dyn_cast_or_null(CE->getCalleeDecl())) { 
return {ND}; } @@ -195,29 +265,31 @@ HeuristicResolver::resolveCalleeOfCallExpr(const CallExpr *CE) const { return resolveExprToDecls(CE->getCallee()); } -std::vector HeuristicResolver::resolveUsingValueDecl( - const UnresolvedUsingValueDecl *UUVD) const { +std::vector HeuristicResolverImpl::resolveUsingValueDecl( + const UnresolvedUsingValueDecl *UUVD) { return resolveDependentMember(UUVD->getQualifier()->getAsType(), UUVD->getNameInfo().getName(), ValueFilter); } -std::vector HeuristicResolver::resolveDependentNameType( - const DependentNameType *DNT) const { +std::vector +HeuristicResolverImpl::resolveDependentNameType(const DependentNameType *DNT) { + if (auto [_, inserted] = SeenDependentNameTypes.insert(DNT); !inserted) + return {}; return resolveDependentMember( resolveNestedNameSpecifierToType(DNT->getQualifier()), DNT->getIdentifier(), TypeFilter); } std::vector -HeuristicResolver::resolveTemplateSpecializationType( - const DependentTemplateSpecializationType *DTST) const { +HeuristicResolverImpl::resolveTemplateSpecializationType( + const DependentTemplateSpecializationType *DTST) { return resolveDependentMember( resolveNestedNameSpecifierToType(DTST->getQualifier()), DTST->getIdentifier(), TemplateFilter); } std::vector -HeuristicResolver::resolveExprToDecls(const Expr *E) const { +HeuristicResolverImpl::resolveExprToDecls(const Expr *E) { if (const auto *ME = dyn_cast(E)) { return resolveMemberExpr(ME); } @@ -236,7 +308,7 @@ HeuristicResolver::resolveExprToDecls(const Expr *E) const { return {}; } -const Type *HeuristicResolver::resolveExprToType(const Expr *E) const { +const Type *HeuristicResolverImpl::resolveExprToType(const Expr *E) { std::vector Decls = resolveExprToDecls(E); if (!Decls.empty()) return resolveDeclsToType(Decls, Ctx); @@ -244,8 +316,8 @@ const Type *HeuristicResolver::resolveExprToType(const Expr *E) const { return E->getType().getTypePtr(); } -const Type *HeuristicResolver::resolveNestedNameSpecifierToType( - const 
NestedNameSpecifier *NNS) const { +const Type *HeuristicResolverImpl::resolveNestedNameSpecifierToType( + const NestedNameSpecifier *NNS) { if (!NNS) return nullptr; @@ -270,8 +342,6 @@ const Type *HeuristicResolver::resolveNestedNameSpecifierToType( return nullptr; } -namespace { - bool isOrdinaryMember(const NamedDecl *ND) { return ND->isInIdentifierNamespace(Decl::IDNS_Ordinary | Decl::IDNS_Tag | Decl::IDNS_Member); @@ -287,11 +357,9 @@ bool findOrdinaryMember(const CXXRecordDecl *RD, CXXBasePath &Path, return false; } -} // namespace - -bool HeuristicResolver::findOrdinaryMemberInDependentClasses( +bool HeuristicResolverImpl::findOrdinaryMemberInDependentClasses( const CXXBaseSpecifier *Specifier, CXXBasePath &Path, - DeclarationName Name) const { + DeclarationName Name) { CXXRecordDecl *RD = resolveTypeToRecordDecl(Specifier->getType().getTypePtr()); if (!RD) @@ -299,9 +367,9 @@ bool HeuristicResolver::findOrdinaryMemberInDependentClasses( return findOrdinaryMember(RD, Path, Name); } -std::vector HeuristicResolver::lookupDependentName( +std::vector HeuristicResolverImpl::lookupDependentName( CXXRecordDecl *RD, DeclarationName Name, - llvm::function_ref Filter) const { + llvm::function_ref Filter) { std::vector Results; // Lookup in the class. 
@@ -332,9 +400,9 @@ std::vector HeuristicResolver::lookupDependentName( return Results; } -std::vector HeuristicResolver::resolveDependentMember( +std::vector HeuristicResolverImpl::resolveDependentMember( const Type *T, DeclarationName Name, - llvm::function_ref Filter) const { + llvm::function_ref Filter) { if (!T) return {}; if (auto *ET = T->getAs()) { @@ -349,6 +417,44 @@ std::vector HeuristicResolver::resolveDependentMember( } return {}; } +} // namespace + +std::vector HeuristicResolver::resolveMemberExpr( + const CXXDependentScopeMemberExpr *ME) const { + return HeuristicResolverImpl(Ctx).resolveMemberExpr(ME); +} +std::vector HeuristicResolver::resolveDeclRefExpr( + const DependentScopeDeclRefExpr *RE) const { + return HeuristicResolverImpl(Ctx).resolveDeclRefExpr(RE); +} +std::vector +HeuristicResolver::resolveTypeOfCallExpr(const CallExpr *CE) const { + return HeuristicResolverImpl(Ctx).resolveTypeOfCallExpr(CE); +} +std::vector +HeuristicResolver::resolveCalleeOfCallExpr(const CallExpr *CE) const { + return HeuristicResolverImpl(Ctx).resolveCalleeOfCallExpr(CE); +} +std::vector HeuristicResolver::resolveUsingValueDecl( + const UnresolvedUsingValueDecl *UUVD) const { + return HeuristicResolverImpl(Ctx).resolveUsingValueDecl(UUVD); +} +std::vector HeuristicResolver::resolveDependentNameType( + const DependentNameType *DNT) const { + return HeuristicResolverImpl(Ctx).resolveDependentNameType(DNT); +} +std::vector +HeuristicResolver::resolveTemplateSpecializationType( + const DependentTemplateSpecializationType *DTST) const { + return HeuristicResolverImpl(Ctx).resolveTemplateSpecializationType(DTST); +} +const Type *HeuristicResolver::resolveNestedNameSpecifierToType( + const NestedNameSpecifier *NNS) const { + return HeuristicResolverImpl(Ctx).resolveNestedNameSpecifierToType(NNS); +} +const Type *HeuristicResolver::getPointeeType(const Type *T) const { + return HeuristicResolverImpl(Ctx).getPointeeType(T); +} } // namespace clangd } // namespace clang 
diff --git a/clang-tools-extra/clangd/HeuristicResolver.h b/clang-tools-extra/clangd/HeuristicResolver.h index dc04123d37593c..dcc063bbc4adc0 100644 --- a/clang-tools-extra/clangd/HeuristicResolver.h +++ b/clang-tools-extra/clangd/HeuristicResolver.h @@ -77,43 +77,6 @@ class HeuristicResolver { private: ASTContext &Ctx; - - // Given a tag-decl type and a member name, heuristically resolve the - // name to one or more declarations. - // The current heuristic is simply to look up the name in the primary - // template. This is a heuristic because the template could potentially - // have specializations that declare different members. - // Multiple declarations could be returned if the name is overloaded - // (e.g. an overloaded method in the primary template). - // This heuristic will give the desired answer in many cases, e.g. - // for a call to vector::size(). - std::vector resolveDependentMember( - const Type *T, DeclarationName Name, - llvm::function_ref Filter) const; - - // Try to heuristically resolve the type of a possibly-dependent expression - // `E`. - const Type *resolveExprToType(const Expr *E) const; - std::vector resolveExprToDecls(const Expr *E) const; - - // Helper function for HeuristicResolver::resolveDependentMember() - // which takes a possibly-dependent type `T` and heuristically - // resolves it to a CXXRecordDecl in which we can try name lookup. - CXXRecordDecl *resolveTypeToRecordDecl(const Type *T) const; - - // This is a reimplementation of CXXRecordDecl::lookupDependentName() - // so that the implementation can call into other HeuristicResolver helpers. - // FIXME: Once HeuristicResolver is upstreamed to the clang libraries - // (https://github.com/clangd/clangd/discussions/1662), - // CXXRecordDecl::lookupDepenedentName() can be removed, and its call sites - // can be modified to benefit from the more comprehensive heuristics offered - // by HeuristicResolver instead. 
- std::vector lookupDependentName( - CXXRecordDecl *RD, DeclarationName Name, - llvm::function_ref Filter) const; - bool findOrdinaryMemberInDependentClasses(const CXXBaseSpecifier *Specifier, - CXXBasePath &Path, - DeclarationName Name) const; }; } // namespace clangd diff --git a/clang-tools-extra/clangd/InlayHints.cpp b/clang-tools-extra/clangd/InlayHints.cpp index 5722ca8f66eb72..5f3687ac581017 100644 --- a/clang-tools-extra/clangd/InlayHints.cpp +++ b/clang-tools-extra/clangd/InlayHints.cpp @@ -651,16 +651,12 @@ class InlayHintVisitor : public RecursiveASTVisitor { // implied object argument ([over.call.func]), the list of provided // arguments is preceded by the implied object argument for the purposes of // this correspondence... - // - // However, we don't have the implied object argument - // for static operator() per clang::Sema::BuildCallToObjectOfClassType. llvm::ArrayRef Args = {E->getArgs(), E->getNumArgs()}; // We don't have the implied object argument through a function pointer // either. if (const CXXMethodDecl *Method = dyn_cast_or_null(Callee.Decl)) - if (Method->isInstance() && - (IsFunctor || Method->hasCXXExplicitFunctionObjectParameter())) + if (IsFunctor || Method->hasCXXExplicitFunctionObjectParameter()) Args = Args.drop_front(1); processCall(Callee, Args); return true; diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp index 29cff68cf03b2e..0af6036734ba53 100644 --- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp +++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp @@ -1009,6 +1009,33 @@ TEST_F(TargetDeclTest, DependentTypes) { )cpp"; EXPECT_DECLS("DependentTemplateSpecializationTypeLoc", "template struct B"); + + // Dependent name with recursive definition. We don't expect a + // result, but we shouldn't get into a stack overflow either. 
+ Code = R"cpp( + template + struct waldo { + typedef typename waldo::type::[[next]] type; + }; + )cpp"; + EXPECT_DECLS("DependentNameTypeLoc"); + + // Similar to above but using mutually recursive templates. + Code = R"cpp( + template + struct odd; + + template + struct even { + using type = typename odd::type::next; + }; + + template + struct odd { + using type = typename even::type::[[next]]; + }; + )cpp"; + EXPECT_DECLS("DependentNameTypeLoc"); } TEST_F(TargetDeclTest, TypedefCascade) { diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 5758b5acbc0b56..8621444364fb20 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -51,21 +51,35 @@ Improvements to clangd Inlay hints ^^^^^^^^^^^ -Diagnostics -^^^^^^^^^^^ - -Semantic Highlighting -^^^^^^^^^^^^^^^^^^^^^ +- Type hints + * Improved heuristics for showing sugared vs. desugared types + * Some hints which provide no information (e.g. ````) are now omitted +- Parameter hints + * Parameter hints are now shown for calls through function pointers + * Parameter hints are now shown for calls to a class's ``operator()`` + * No longer show bogus parameter hints for some builtins like ``__builtin_dump_struct`` Compile flags ^^^^^^^^^^^^^ +- System include extractor (``--query-driver``) improvements + * The directory containing builtin headers is now excluded from extracted system includes + * Various flags which can affect the system includes (``--target``, ``--stdlib``, ``-specs``) are now forwarded to the driver + * Fixed a bug where clangd would sometimes try to call a driver that didn't have obj-c support with ``-x objective-c++-header`` + * The driver path is now dot-normalized before being compared to the ``--query-driver`` pattern + * ``--query-driver`` is now supported by ``clangd-indexer`` +- Fixed a regression in clangd 17 where response files would not be expanded + Hover ^^^^^ +- Hover now shows alignment info for 
fields and records + Code completion ^^^^^^^^^^^^^^^ +- Refined heuristics for determining whether the use of a function can be a call or not + Code actions ^^^^^^^^^^^^ @@ -75,15 +89,25 @@ Code actions Signature help ^^^^^^^^^^^^^^ +- Improved support for calls through function pointer types + Cross-references ^^^^^^^^^^^^^^^^ +- Improved support for C++20 concepts +- Find-references now works for labels +- Improvements to template heuristics + Objective-C ^^^^^^^^^^^ Miscellaneous ^^^^^^^^^^^^^ +- Various stability improvements, e.g. crash fixes +- Improved error recovery on invalid code +- Clangd now bails gracefully on assembly and IR source files + Improvements to clang-doc ------------------------- @@ -564,10 +588,15 @@ Changes in existing checks Removed checks ^^^^^^^^^^^^^^ -Improvements to include-fixer +Improvements to include-cleaner ----------------------------- -The improvements are... +- Support for ``--only-headers`` flag to limit analysis to headers matching a regex +- Recognizes references through ``concept``s +- Builtin headers are not analyzed +- Handling of references through ``friend`` declarations +- Fixes around handling of IWYU pragmas on stdlib headers +- Improved handling around references to/from template specializations Improvements to clang-include-fixer ----------------------------------- diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake index 1ca9138b980731..bd1f688d61a7ea 100644 --- a/clang/cmake/caches/Release.cmake +++ b/clang/cmake/caches/Release.cmake @@ -4,7 +4,7 @@ # General Options set(LLVM_RELEASE_ENABLE_LTO THIN CACHE STRING "") -set(LLVM_RELEASE_ENABLE_PGO ON CACHE BOOL "") +set(LLVM_RELEASE_ENABLE_PGO OFF CACHE BOOL "") set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "") diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 4dc0de3a90f265..0b887288fe2cb1 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ 
-5277,15 +5277,9 @@ the configuration (without a prefix: ``Auto``). Possible values: * ``SBPO_Never`` (in configuration: ``Never``) - Never put a space before opening parentheses. - - .. code-block:: c++ - - void f() { - if(true) { - f(); - } - } + This is **deprecated** and replaced by ``Custom`` below, with all + ``SpaceBeforeParensOptions`` but ``AfterPlacementOperator`` set to + ``false``. * ``SBPO_ControlStatements`` (in configuration: ``ControlStatements``) Put a space before opening parentheses only after control statement @@ -5425,32 +5419,14 @@ the configuration (without a prefix: ``Auto``). void operator++ (int a); vs. void operator++(int a); object.operator++ (10); object.operator++(10); - * ``AfterPlacementOperatorStyle AfterPlacementOperator`` :versionbadge:`clang-format 18` - - Defines in which cases to put a space between ``new/delete`` operators - and opening parentheses. - - Possible values: - - * ``APO_Never`` (in configuration: ``Never``) - Remove space after ``new/delete`` operators and before ``(``. - - .. code-block:: c++ - - new(buf) T; - delete(buf) T; - - * ``APO_Always`` (in configuration: ``Always``) - Always add space after ``new/delete`` operators and before ``(``. + * ``bool AfterPlacementOperator`` If ``true``, put a space between operator ``new``/``delete`` and opening + parenthesis. - .. code-block:: c++ - - new (buf) T; - delete (buf) T; - - * ``APO_Leave`` (in configuration: ``Leave``) - Leave placement ``new/delete`` expressions as they are. + .. code-block:: c++ + true: false: + new (buf) T; vs. new(buf) T; + delete (buf) T; delete(buf) T; * ``bool AfterRequiresInClause`` If ``true``, put space between requires keyword in a requires clause and opening parentheses, if there is one. diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 060bc7669b72a5..e533ecfd5aeba5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -58,12 +58,12 @@ code bases. 
To reduce such widespread breakages, as an extension, Clang accepts this code with an existing warning ``-Wambiguous-reversed-operator`` warning. - Fixes `GH `_. + Fixes `#53954 `_. - The CMake variable ``GCC_INSTALL_PREFIX`` (which sets the default ``--gcc-toolchain=``) is deprecated and will be removed. Specify ``--gcc-install-dir=`` or ``--gcc-triple=`` in a `configuration file - ` as a + `_ as a replacement. (`#77537 `_) @@ -95,7 +95,7 @@ C/C++ Language Potentially Breaking Changes - Fixed a bug in finding matching `operator!=` while adding reversed `operator==` as outlined in "The Equality Operator You Are Looking For" (`P2468 `_). - Fixes (`#68901: `_). + Fixes (`#68901 `_). C++ Specific Potentially Breaking Changes ----------------------------------------- @@ -139,16 +139,22 @@ C++ Specific Potentially Breaking Changes - Remove the hardcoded path to the imported modules for C++20 named modules. Now we require all the dependent modules to specified from the command line. - See (`#62707: `_). + See (`#62707 `_). - Forbid `import XXX;` in C++ to find module `XXX` comes from explicit clang modules. - See (`#64755: `_). + See (`#64755 `_). ABI Changes in This Version --------------------------- - Following the SystemV ABI for x86-64, ``__int128`` arguments will no longer be split between a register and a stack slot. +- Fixed Microsoft calling convention for returning certain classes with a + templated constructor. If a class has a templated constructor, it should + be returned indirectly even if it meets all the other requirements for + returning a class in a register. This affects some uses of std::pair. + (#GH86384). 
+ AST Dumping Potentially Breaking Changes ---------------------------------------- - When dumping a sugared type, Clang will no longer print the desugared type if @@ -171,6 +177,22 @@ AST Dumping Potentially Breaking Changes "qualType": "foo" } +Clang Frontend Potentially Breaking Changes +------------------------------------------- +- Target OS macros extension + A new Clang extension (see :ref:`here `) is enabled for + Darwin (Apple platform) targets. Clang now defines ``TARGET_OS_*`` macros for + these targets, which could break existing code bases with improper checks for + the ``TARGET_OS_`` macros. For example, existing checks might fail to include + the ``TargetConditionals.h`` header from Apple SDKs and therefore leave the + macros undefined and guarded code unexercised. + + Affected code should be checked to see if it's still intended for the specific + target and fixed accordingly. + + The extension can be turned off by the option ``-fno-define-target-os-macros`` + as a workaround. + What's New in Clang |release|? ============================== Some of the major new features and improvements to Clang are listed @@ -183,11 +205,17 @@ C++ Language Changes C++20 Feature Support ^^^^^^^^^^^^^^^^^^^^^ -- Implemented `P1907R1 ` which extends allowed non-type template argument +- Implemented `P1907R1 `_ which extends allowed non-type template argument kinds with e.g. floating point values and pointers and references to subobjects. This feature is still experimental. Accordingly, ``__cpp_nontype_template_args`` was not updated. However, its support can be tested with ``__has_extension(cxx_generalized_nttp)``. +- Clang won't perform ODR checks for decls in the global module fragment any + more, to ease the implementation and improve the user experience. + This follows MSVC's behavior. Users interested in testing the stricter + behavior can use the flag '-Xclang -fno-skip-odr-check-in-gmf'. + (`#79240 `_).
+ C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ - Implemented `P0847R7: Deducing this `_. Some related core issues were also @@ -234,15 +262,10 @@ C++2c Feature Support Resolutions to C++ Defect Reports ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- Implemented `CWG2137 `_ which allows - list-initialization from objects of the same type. -- Implemented `CWG2311 `_: given a prvalue ``e`` of object type - ``T``, ``T{e}`` will try to resolve an initializer list constructor and will use it if successful (CWG2137). - Otherwise, if there is no initializer list constructor, the copy will be elided as if it was ``T(e)``. - Implemented `CWG2598 `_ and `CWG2096 `_, making unions (that have either no members or at least one literal member) literal types. - (`#77924: `_). + (`#77924 `_). C Language Changes @@ -303,6 +326,10 @@ Non-comprehensive list of changes in this release * The version of Unicode used by Clang (primarily to parse identifiers) has been updated to 15.1. +* Clang now defines macro ``__LLVM_INSTR_PROFILE_GENERATE`` when compiling with + PGO instrumentation profile generation, and ``__LLVM_INSTR_PROFILE_USE`` when + compiling with PGO profile use. + New Compiler Flags ------------------ @@ -343,6 +370,17 @@ New Compiler Flags attribute the replaceable global new and delete operators behave normally (like other functions) with respect to visibility attributes, pragmas and options (e.g ``--fvisibility=``). +* Full register names can be used when printing assembly via ``-mregnames``. + This option now matches the one used by GCC. + +.. _target_os_detail: + +* ``-fdefine-target-os-macros`` and its complement + ``-fno-define-target-os-macros``. Enables or disables the Clang extension to + provide built-in definitions of a list of ``TARGET_OS_*`` macros based on the + target triple. + + The extension is enabled by default for Darwin (Apple platform) targets. 
Deprecated Compiler Flags ------------------------- @@ -362,6 +400,7 @@ Modified Compiler Flags * ``-fvisibility-global-new-delete-hidden`` is now a deprecated spelling of ``-fvisibility-global-new-delete=force-hidden`` (``-fvisibility-global-new-delete=`` is new in this release). +* ``-fprofile-update`` is enabled for ``-fprofile-generate``. Removed Compiler Flags ------------------------- @@ -382,7 +421,7 @@ Attribute Changes in Clang types after default argument promotion. As a result, it's no longer an automatic diagnostic to use parameters of types that the format style supports but that are never the result of default argument promotion, such as - ``float``. (`#59824: `_) + ``float``. (`#59824 `_) - Clang now supports ``[[clang::preferred_type(type-name)]]`` as an attribute which can be applied to a bit-field. This attribute helps to map a bit-field @@ -450,13 +489,13 @@ Improvements to Clang's diagnostics - Clang's ``-Wtautological-negation-compare`` flag now diagnoses logical tautologies like ``x && !x`` and ``!x || x`` in expressions. This also makes ``-Winfinite-recursion`` diagnose more cases. - (`#56035: `_). + (`#56035 `_). - Clang constexpr evaluator now diagnoses compound assignment operators against uninitialized variables as a read of uninitialized object. (`#51536 `_) - Clang's ``-Wformat-truncation`` now diagnoses ``snprintf`` call that is known to result in string truncation. - (`#64871: `_). + (`#64871 `_). Existing warnings that similarly warn about the overflow in ``sprintf`` now falls under its own warning group ```-Wformat-overflow`` so that it can be disabled separately from ``Wfortify-source``. @@ -472,12 +511,12 @@ Improvements to Clang's diagnostics - Clang now emits ``-Wcast-qual`` for functional-style cast expressions. - Clang no longer emits irrelevant notes about unsatisfied constraint expressions on the left-hand side of ``||`` when the right-hand side constraint is satisfied. - (`#54678: `_). + (`#54678 `_). 
- Clang now prints its 'note' diagnostic in cyan instead of black, to be more compatible with terminals with dark background colors. This is also more consistent with GCC. - Clang now displays an improved diagnostic and a note when a defaulted special member is marked ``constexpr`` in a class with a virtual base class - (`#64843: `_). + (`#64843 `_). - ``-Wfixed-enum-extension`` and ``-Wmicrosoft-fixed-enum`` diagnostics are no longer emitted when building as C23, since C23 standardizes support for enums with a fixed underlying type. @@ -517,10 +556,10 @@ Improvements to Clang's diagnostics 1 | static_assert("A\n"[1] == U'🌍'); | ~~~~~~~~~^~~~~~~~ - Clang now always diagnoses when using non-standard layout types in ``offsetof`` . - (`#64619: `_) + (`#64619 `_) - Clang now diagnoses redefined defaulted constructor when redefined defaulted constructor with different exception specs. - (`#69094: `_) + (`#69094 `_) - Clang now diagnoses use of variable-length arrays in C++ by default (and under ``-Wall`` in GNU++ mode). This is an extension supported by Clang and GCC, but is very easy to accidentally use without realizing it's a @@ -596,7 +635,7 @@ Improvements to Clang's diagnostics - Clang now diagnoses definitions of friend function specializations, e.g. ``friend void f<>(int) {}``. - Clang now diagnoses narrowing conversions involving const references. - (`#63151: `_). + (`#63151 `_). - Clang now diagnoses unexpanded packs within the template argument lists of function template specializations. - The warning `-Wnan-infinity-disabled` is now emitted when ``INFINITY`` or ``NAN`` are used in arithmetic operations or function arguments in @@ -606,7 +645,7 @@ Improvements to Clang's diagnostics - Clang now diagnoses attempts to bind a bitfield to an NTTP of a reference type as erroneous converted constant expression and not as a reference to subobject. 
- Clang now diagnoses ``auto`` and ``decltype(auto)`` in declarations of conversion function template - (`CWG1878: `_) + (`CWG1878 `_) - Clang now diagnoses the requirement that non-template friend declarations with requires clauses and template friend declarations with a constraint that depends on a template parameter from an enclosing template must be a definition. @@ -637,15 +676,15 @@ Improvements to Clang's diagnostics - Clang now diagnoses import before module declarations but not in global module fragment. - (`#67627: `_). + (`#67627 `_). - Clang now diagnoses include headers with angle in module purviews, which is not usually intended. - (`#68615: `_) + (`#68615 `_) - Clang now won't mention invisible namespace when diagnose invisible declarations inside namespace. The original diagnostic message is confusing. - (`#73893: `_) + (`#73893 `_) Improvements to Clang's time-trace ---------------------------------- @@ -859,6 +898,16 @@ Bug Fixes in This Version Fixes (`#78290 `_) - Fixed assertion failure with deleted overloaded unary operators. Fixes (`#78314 `_) +- The XCOFF object file format does not support aliases to symbols having common + linkage. Clang now diagnoses the use of an alias for a common symbol when + compiling for AIX. + +- Clang now doesn't produce false-positive warning `-Wconstant-logical-operand` + for logical operators in C23. + Fixes (`#64356 `_). +- Clang's ``-Wshadow`` no longer warns when an init-capture is named the same as + a class field unless the lambda can capture this. + Fixes (`#71976 `_) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -923,7 +972,7 @@ Bug Fixes to C++ Support - Fix a crash when calling a non-constant immediate function in the initializer of a static data member. - (`#65985 _`). + (`#65985 `_). - Clang now properly converts static lambda call operator to function pointers on win32. 
(`#62594 `_) @@ -932,7 +981,7 @@ Bug Fixes to C++ Support of a function template or a member function of a class template was assigned the location of a non-defining declaration rather than the location of the definition the specialization was instantiated from. - (`#26057 `_`) + (`#26057 `_) - Fix a crash when a default member initializer of a base aggregate makes an invalid call to an immediate function. @@ -943,11 +992,11 @@ Bug Fixes to C++ Support (`#48527 `_) - Clang now no longer asserts when an UnresolvedLookupExpr is used as an - expression requirement. (`#66612 https://github.com/llvm/llvm-project/issues/66612`) + expression requirement. (`#66612 `_) - Clang now disambiguates NTTP types when printing diagnostics where the NTTP types are compared with the 'diff' method. - (`#66744 https://github.com/llvm/llvm-project/issues/66744`) + (`#66744 `_) - Fix crash caused by a spaceship operator returning a comparision category by reference. Fixes: @@ -1041,9 +1090,6 @@ Bug Fixes to C++ Support in different visibility. Fixes (`#67893 `_) -- Fix a false-positive ODR violation for different definitions for `std::align_val_t`. - Fixes (`#76638 `_) - - Remove recorded `#pragma once` state for headers included in named modules. Fixes (`#77995 `_) @@ -1054,6 +1100,25 @@ Bug Fixes to C++ Support Fixes (`#78830 `_) Fixes (`#60085 `_) + +- Fixed a bug where variables referenced by requires-clauses inside + nested generic lambdas were not properly injected into the constraint scope. + (`#73418 `_) + +- Fix incorrect code generation caused by the object argument of ``static operator()`` and ``static operator[]`` calls not being evaluated. + Fixes (`#67976 `_) + +- Fix crash when using an immediate-escalated function at global scope. + (`#82258 `_) +- Correctly immediate-escalate lambda conversion functions. + (`#82258 `_) +- Fix a crash when an unresolved overload set is encountered on the RHS of a ``.*`` operator. 
+ (`#53815 `_) + +- Fixed a regression in CTAD that a friend declaration that befriends itself may cause + incorrect constraint substitution. + (`#86769 `_) + Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ - Fixed an import failure of recursive friend class template. @@ -1164,6 +1229,17 @@ Arm and AArch64 Support * Cortex-A720 (cortex-a720). * Cortex-X4 (cortex-x4). +- Alpha support has been added for SVE2.1 intrinsics. + +- Support has been added for `-fstack-clash-protection` and `-mstack-probe-size` + command line options. + +- Function Multi Versioning has been extended to support Load-Acquire RCpc + instructions v3 (rcpc3) as well as Memory Copy and Memory Set Acceleration + instructions (mops) when targeting AArch64. The feature identifiers (in + parenthesis) can be used with either of the ``target_version`` and + ``target_clones`` attributes. + Android Support ^^^^^^^^^^^^^^^ @@ -1195,6 +1271,8 @@ Windows Support linking may succeed but the resulting executables may expose issues at runtime. +- Clang now passes relevant LTO options to the linker (LLD) in MinGW mode. + LoongArch Support ^^^^^^^^^^^^^^^^^ - Added builtins support for all LSX (128-bits SIMD) and LASX (256-bits SIMD) @@ -1227,6 +1305,9 @@ RISC-V Support - Default ABI with F but without D was changed to ilp32f for RV32 and to lp64f for RV64. +- ``__attribute__((rvv_vector_bits(N)))`` is now supported for RVV vbool*_t types. +- ``-mtls-dialect=desc`` is now supported to enable TLS descriptors (TLSDESC). + CUDA/HIP Language Changes ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1236,6 +1317,16 @@ CUDA Support - Clang now supports CUDA SDK up to 12.3 - Added support for sm_90a +PowerPC Support +^^^^^^^^^^^^^^^ + +- Added ``nmmintrin.h`` to intrinsics headers. +- Added ``__builtin_ppc_fence`` as barrier of code motion, and + ``__builtin_ppc_mffsl`` for corresponding instruction. +- Supported ``__attribute__((target("tune=cpu")))``. 
+- Emit ``float-abi`` module flag on 64-bit ELFv2 PowerPC targets if + ``long double`` type is used in current module. + AIX Support ^^^^^^^^^^^ @@ -1244,6 +1335,15 @@ AIX Support base is encoded as an immediate operand. This access sequence is not used for TLS variables larger than 32KB, and is currently only supported on 64-bit mode. +- Inline assembler supports VSR register in pure digits. +- Enabled ThinLTO support. Requires AIX 7.2 TL5 SP7 or newer, or AIX 7.3 TL2 + or newer. Similar to the LTO support on AIX, ThinLTO is implemented with + the libLTO.so plugin. + +SystemZ Support +^^^^^^^^^^^^^^^ +- Properly support 16 byte atomic int/fp types and ops. Atomic __int128 (and + long double) variables are now aligned to 16 bytes by default (like gcc 14). WebAssembly Support ^^^^^^^^^^^^^^^^^^^ @@ -1307,6 +1407,8 @@ libclang - Exposed arguments of ``clang::annotate``. - ``clang::getCursorKindForDecl`` now recognizes linkage specifications such as ``extern "C"`` and reports them as ``CXCursor_LinkageSpec``. +- Changed the libclang library on AIX to export only the necessary symbols to + prevent issues of resolving to the wrong duplicate symbol. Static Analyzer --------------- @@ -1318,9 +1420,6 @@ New features of static analysis tools, such as the Clang Static Analyzer. `Documentation `__. -- Added support for the ``cleanup`` attribute. - `Documentation `__. - - Support "Deducing this" (P0847R7). (Worked out of the box) (`af4751738db8 `__) @@ -1381,6 +1480,10 @@ Crash and bug fixes - Fix false positive in mutation check when using pointer to member function. (`#66204 `_) +- Fixed a crash in ``security.cert.env.InvalidPtr`` checker when accidentally + matched user-defined ``strerror`` and similar library functions. 
+ (`#88181 `_) + Improvements ^^^^^^^^^^^^ diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst index 81043ff25be02e..0f85065f464a8f 100644 --- a/clang/docs/StandardCPlusPlusModules.rst +++ b/clang/docs/StandardCPlusPlusModules.rst @@ -457,6 +457,29 @@ Note that **currently** the compiler doesn't consider inconsistent macro definit Currently Clang would accept the above example. But it may produce surprising results if the debugging code depends on consistent use of ``NDEBUG`` also in other translation units. +Definitions consistency +^^^^^^^^^^^^^^^^^^^^^^^ + +The C++ language requires that declarations of the same entity in different translation +units have the same definition; this is known as the ODR (One Definition Rule). Prior to +modules, translation units don't depend on each other and the compiler itself can't perform +a strong ODR violation check. With the introduction of modules, the compiler now has the +chance to check for ODR violations with language semantics across translation units. + +However, in practice, we found the existing ODR checking mechanism is not stable +enough. Many people suffer from false positive ODR violation diagnostics, i.e., +the compiler incorrectly complains that two identical declarations have different +definitions. Meanwhile, true positive ODR violations are rarely reported. +We also learned that MSVC doesn't perform ODR checks for declarations in the global module +fragment. + +So, in order to provide a better user experience, save the time spent on ODR checking, and +keep behavior consistent with MSVC, we disabled the ODR check for declarations in the global +module fragment by default. Users who want stricter checks can still use the +``-Xclang -fno-skip-odr-check-in-gmf`` flag to get the ODR check enabled. Users are also +encouraged to report issues if they find false positive or false negative ODR +violations with the flag enabled.
+ ABI Impacts ----------- diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 9a4736019d1b1b..eb7a1a32060077 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -673,6 +673,16 @@ class alignas(8) Decl { /// fragment. See [module.global.frag]p3,4 for details. bool isDiscardedInGlobalModuleFragment() const { return false; } + /// Check if we should skip checking ODRHash for declaration \param D. + /// + /// The existing ODRHash mechanism seems to be not stable enough and + /// the false positive ODR violation reports are annoying and we rarely see + /// true ODR violation reports. Also we learned that MSVC disabled ODR checks + /// for declarations in GMF. So we try to disable ODR checks in the GMF to + /// get better user experiences before we make the ODR violation checks stable + /// enough. + bool shouldSkipCheckingODR() const; + /// Return true if this declaration has an attribute which acts as /// definition of the entity, such as 'alias' or 'ifunc'. bool hasDefiningAttr() const; diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index ea425791fc97f0..6384cf9420b82e 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -3495,6 +3495,9 @@ enum class VectorKind { /// is RISC-V RVV fixed-length data vector RVVFixedLengthData, + + /// is RISC-V RVV fixed-length mask vector + RVVFixedLengthMask, }; /// Represents a GCC generic vector type. 
This type is created using diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 58838b01b4fd7c..dbf2dd2120fb69 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1590,6 +1590,7 @@ def RegCall : DeclOrTypeAttr { } def Final : InheritableAttr { + let CanPrintOnLeft = 0; let Spellings = [CustomKeyword<"final">, CustomKeyword<"sealed">]; let Accessors = [Accessor<"isSpelledAsSealed", [CustomKeyword<"sealed">]>]; let SemaHandler = 0; @@ -2472,6 +2473,7 @@ def Overloadable : Attr { } def Override : InheritableAttr { + let CanPrintOnLeft = 0; let Spellings = [CustomKeyword<"override">]; let SemaHandler = 0; // Omitted from docs, since this is language syntax, not an attribute, as far diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 7e633f8e2635a9..e02a1201e2ad79 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -2424,7 +2424,10 @@ only be a power of 2 between 64 and 65536. For types where LMUL!=1, ``__riscv_v_fixed_vlen`` needs to be scaled by the LMUL of the type before passing to the attribute. -``vbool*_t`` types are not supported at this time. +For ``vbool*_t`` types, ``__riscv_v_fixed_vlen`` needs to be divided by the +number from the type name. For example, ``vbool8_t`` needs to use +``__riscv_v_fixed_vlen`` / 8. If the resulting value is not a multiple of 8, +the type is not supported for that value of ``__riscv_v_fixed_vlen``. 
}]; } diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index d208342d9c516e..74dfd1d214e849 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -436,5 +436,67 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,w TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") +//===----------------------------------------------------------------------===// +// WMMA builtins. +// Postfix w32 indicates the builtin requires wavefront size of 32. +// Postfix w64 indicates the builtin requires wavefront size of 64. +// +// Some of these are very similar to their GFX11 counterparts, but they don't +// require replication of the A,B matrices, so they use fewer vector elements. +// Therefore, we add an "_gfx12" suffix to distinguish them from the existing +// builtins. +//===----------------------------------------------------------------------===// +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8hV8hV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12, "V8fV8sV8sV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8hV8hV8hV8h", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12, "V8sV8sV8sV8s", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32") +// These are gfx12-only, but for consistency with the other WMMA variants we're +// keeping the "_gfx12" suffix. 
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32") + +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, "V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4hV4hV4hV4h", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12, "V4sV4sV4sV4s", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") +// These are gfx12-only, but for consistency with the other WMMA variants we're +// keeping the "_gfx12" suffix. 
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") + +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32, "V8fV8hV16hV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32, "V8fV8sV16sV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32, "V8hV8hV16hV8hs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32, "V8sV8sV16sV8ss", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32, "V8iIbiIbV2iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") + +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64, "V4fV4hV8hV4fs", 
"nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64, "V4fV4sV8sV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64, "V4hV4hV8hV4hs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64, "V4sV4sV8sV4ss", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64, "V4iIbiIbiV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") + #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 2f2e45d5cf63df..7c0bfe32849614 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -369,6 +369,9 @@ ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 3, llvm::driver::VectorLibr /// The default TLS model to use. ENUM_CODEGENOPT(DefaultTLSModel, TLSModel, 2, GeneralDynamicTLSModel) +/// Whether to enable TLSDESC. AArch64 enables TLSDESC regardless of this value. +CODEGENOPT(EnableTLSDESC, 1, 0) + /// Bit size of immediate TLS offsets (0 == use the default). 
VALUE_CODEGENOPT(TLSSize, 8, 0) diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td index b1bada65cb6b28..08bb1d81ba29f1 100644 --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -73,7 +73,7 @@ def warn_pragma_debug_unexpected_argument : Warning< def warn_fp_nan_inf_when_disabled : Warning< "use of %select{infinity|NaN}0%select{| via a macro}1 is undefined behavior " "due to the currently enabled floating-point options">, - InGroup>; + InGroup>; } // Parse && Sema diff --git a/clang/include/clang/Basic/DiagnosticDocs.td b/clang/include/clang/Basic/DiagnosticDocs.td index e9862422b4997e..8c024b5cad740a 100644 --- a/clang/include/clang/Basic/DiagnosticDocs.td +++ b/clang/include/clang/Basic/DiagnosticDocs.td @@ -87,3 +87,12 @@ program by treating all string literals as having type ``const char *`` instead of ``char *``. This can cause unexpected behaviors with type-sensitive constructs like ``_Generic``. }]; + +defvar NanInfDisabledDocs = [{ +This warning is enabled when source code using the macros ``INFINITY`` or ``NAN`` +is compiled with floating-point options preventing these two values. This can +lead to undefined behavior. Check the order of command line arguments that modify +this behavior, such as ``-ffast-math``, ``-fhonor-infinities``, and +``-fhonor-nans`` (etc), as well as ``#pragma`` directives if this diagnostic is +generated unexpectedly. 
+}]; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index a1c32abb4dcd88..ef8c111b1d8cc8 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3711,6 +3711,12 @@ def err_sme_za_call_no_za_state : Error< "call to a shared ZA function requires the caller to have ZA state">; def err_sme_zt0_call_no_zt0_state : Error< "call to a shared ZT0 function requires the caller to have ZT0 state">; +def err_sme_unimplemented_za_save_restore : Error< + "call to a function that shares state other than 'za' from a " + "function that has live 'za' state requires a spill/fill of ZA, which is not yet " + "implemented">; +def note_sme_use_preserves_za : Note< + "add '__arm_preserves(\"za\")' to the callee if it preserves ZA">; def err_sme_definition_using_sm_in_non_sme_target : Error< "function executed in streaming-SVE mode requires 'sme'">; def err_sme_definition_using_za_in_non_sme_target : Error< diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 8fc75e1cca0399..4942dcaa086eac 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -174,6 +174,7 @@ LANGOPT(MathErrno , 1, 1, "errno in math functions") BENIGN_LANGOPT(HeinousExtensions , 1, 0, "extensions that we really don't like and may be ripped out at any time") LANGOPT(Modules , 1, 0, "modules semantics") COMPATIBLE_LANGOPT(CPlusPlusModules, 1, 0, "C++ modules syntax") +LANGOPT(SkipODRCheckInGMF, 1, 0, "Skip ODR checks for decls in the global module fragment") LANGOPT(BuiltinHeadersInSystemModules, 1, 0, "builtin headers belong to system modules, and _Builtin_ modules are ignored for cstdlib headers") BENIGN_ENUM_LANGOPT(CompilingModule, CompilingModuleKind, 3, CMK_None, "compiling a module interface") diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 
695e1bddf9ffc6..2da0e8d2aba9a4 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -44,6 +44,7 @@ defm SVLD1_ZA32 : ZALoad<"za32", "i", "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0 defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>]>; defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>]>; +let TargetGuard = "sme" in { def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQl", "", [IsOverloadNone, IsStreamingCompatible, IsInOutZA], MemEltTyDefault, "aarch64_sme_ldr">; @@ -51,6 +52,7 @@ def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQl", "", def SVLDR_ZA : MInst<"svldr_za", "vmQ", "", [IsOverloadNone, IsStreamingCompatible, IsInOutZA], MemEltTyDefault, "aarch64_sme_ldr", []>; +} //////////////////////////////////////////////////////////////////////////////// // Stores @@ -81,6 +83,7 @@ defm SVST1_ZA32 : ZAStore<"za32", "i", "aarch64_sme_st1w", [ImmCheck<0, ImmCheck defm SVST1_ZA64 : ZAStore<"za64", "l", "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>]>; defm SVST1_ZA128 : ZAStore<"za128", "q", "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>]>; +let TargetGuard = "sme" in { def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%l", "", [IsOverloadNone, IsStreamingCompatible, IsInZA], MemEltTyDefault, "aarch64_sme_str">; @@ -88,6 +91,7 @@ def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%l", "", def SVSTR_ZA : MInst<"svstr_za", "vm%", "", [IsOverloadNone, IsStreamingCompatible, IsInZA], MemEltTyDefault, "aarch64_sme_str", []>; +} //////////////////////////////////////////////////////////////////////////////// // Read horizontal/vertical ZA slices @@ -277,22 +281,22 @@ multiclass ZAAddSub { def NAME # _ZA32_VG1x2_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x2", "vm2", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x2", [IsStreaming, IsInOutZA], []>; def NAME # _ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x4", "vm4", "iUif", MergeNone, "aarch64_sme_" # n_suffix # 
"_za32_vg1x4", [IsStreaming, IsInOutZA], []>; + } - let TargetGuard = "sme-i16i64" in { - def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsInOutZA], []>; - def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsInOutZA], []>; + let TargetGuard = "sme2,sme-i16i64" in { + def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsInOutZA], []>; - def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsInOutZA], []>; - def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsInOutZA], []>; - def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; - def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", 
"vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; - } + def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; + } - let TargetGuard = "sme-f64f64" in { - def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; - def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; - } + let TargetGuard = "sme2,sme-f64f64" in { + def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; } } diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7f4fa33748faca..175bedbfb4d01c 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2985,6 +2985,14 @@ def fmodule_output : Flag<["-"], "fmodule-output">, Flags<[NoXarchOption]>, Visibility<[ClangOption, CC1Option]>, HelpText<"Save intermediate module file results when compiling a standard C++ module unit.">; +defm skip_odr_check_in_gmf : BoolOption<"f", "skip-odr-check-in-gmf", + LangOpts<"SkipODRCheckInGMF">, DefaultFalse, + PosFlag, + NegFlag>, + Group; + def fmodules_prune_interval : Joined<["-"], "fmodules-prune-interval=">, Group, Visibility<[ClangOption, CC1Option]>, MetaVarName<"">, HelpText<"Specify the 
interval (in seconds) between attempts to prune the module cache">, @@ -4419,6 +4427,8 @@ def mtls_size_EQ : Joined<["-"], "mtls-size=">, Group, HelpText<"Specify bit size of immediate TLS offsets (AArch64 ELF only): " "12 (for 4KB) | 24 (for 16MB, default) | 32 (for 4GB) | 48 (for 256TB, needs -mcmodel=large)">, MarshallingInfoInt>; +def mtls_dialect_EQ : Joined<["-"], "mtls-dialect=">, Group, + Flags<[TargetSpecific]>, HelpText<"Which thread-local storage dialect to use for dynamic accesses of TLS variables">; def mimplicit_it_EQ : Joined<["-"], "mimplicit-it=">, Group; def mdefault_build_attributes : Joined<["-"], "mdefault-build-attributes">, Group; def mno_default_build_attributes : Joined<["-"], "mno-default-build-attributes">, Group; @@ -5805,6 +5815,18 @@ def mvis3 : Flag<["-"], "mvis3">, Group; def mno_vis3 : Flag<["-"], "mno-vis3">, Group; def mhard_quad_float : Flag<["-"], "mhard-quad-float">, Group; def msoft_quad_float : Flag<["-"], "msoft-quad-float">, Group; +foreach i = 1 ... 7 in + def ffixed_g#i : Flag<["-"], "ffixed-g"#i>, Group, + HelpText<"Reserve the G"#i#" register (SPARC only)">; +foreach i = 0 ... 5 in + def ffixed_o#i : Flag<["-"], "ffixed-o"#i>, Group, + HelpText<"Reserve the O"#i#" register (SPARC only)">; +foreach i = 0 ... 7 in + def ffixed_l#i : Flag<["-"], "ffixed-l"#i>, Group, + HelpText<"Reserve the L"#i#" register (SPARC only)">; +foreach i = 0 ... 
5 in + def ffixed_i#i : Flag<["-"], "ffixed-i"#i>, Group, + HelpText<"Reserve the I"#i#" register (SPARC only)">; } // let Flags = [TargetSpecific] // M68k features flags @@ -7066,6 +7088,9 @@ def fexperimental_assignment_tracking_EQ : Joined<["-"], "fexperimental-assignme Values<"disabled,enabled,forced">, NormalizedValues<["Disabled","Enabled","Forced"]>, MarshallingInfoEnum, "Enabled">; +def enable_tlsdesc : Flag<["-"], "enable-tlsdesc">, + MarshallingInfoFlag>; + } // let Visibility = [CC1Option] //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index bc9eecd42f9ebf..e61619e90317ee 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -4157,14 +4157,9 @@ struct FormatStyle { /// Different ways to put a space before opening parentheses. enum SpaceBeforeParensStyle : int8_t { - /// Never put a space before opening parentheses. - /// \code - /// void f() { - /// if(true) { - /// f(); - /// } - /// } - /// \endcode + /// This is **deprecated** and replaced by ``Custom`` below, with all + /// ``SpaceBeforeParensOptions`` but ``AfterPlacementOperator`` set to + /// ``false``. SBPO_Never, /// Put a space before opening parentheses only after control statement /// keywords (``for/if/while...``). @@ -4273,28 +4268,14 @@ struct FormatStyle { /// object.operator++ (10); object.operator++(10); /// \endcode bool AfterOverloadedOperator; - /// Styles for adding spacing between ``new/delete`` operators and opening - /// parentheses. - enum AfterPlacementOperatorStyle : int8_t { - /// Remove space after ``new/delete`` operators and before ``(``. - /// \code - /// new(buf) T; - /// delete(buf) T; - /// \endcode - APO_Never, - /// Always add space after ``new/delete`` operators and before ``(``. 
- /// \code - /// new (buf) T; - /// delete (buf) T; - /// \endcode - APO_Always, - /// Leave placement ``new/delete`` expressions as they are. - APO_Leave, - }; - /// Defines in which cases to put a space between ``new/delete`` operators - /// and opening parentheses. - /// \version 18 - AfterPlacementOperatorStyle AfterPlacementOperator; + /// If ``true``, put a space between operator ``new``/``delete`` and opening + /// parenthesis. + /// \code + /// true: false: + /// new (buf) T; vs. new(buf) T; + /// delete (buf) T; delete(buf) T; + /// \endcode + bool AfterPlacementOperator; /// If ``true``, put space between requires keyword in a requires clause and /// opening parentheses, if there is one. /// \code @@ -4327,7 +4308,7 @@ struct FormatStyle { : AfterControlStatements(false), AfterForeachMacros(false), AfterFunctionDeclarationName(false), AfterFunctionDefinitionName(false), AfterIfMacros(false), - AfterOverloadedOperator(false), AfterPlacementOperator(APO_Leave), + AfterOverloadedOperator(false), AfterPlacementOperator(true), AfterRequiresInClause(false), AfterRequiresInExpression(false), BeforeNonEmptyParentheses(false) {} @@ -5212,7 +5193,8 @@ llvm::Expected getStyle(StringRef StyleName, StringRef FileName, StringRef FallbackStyle, StringRef Code = "", llvm::vfs::FileSystem *FS = nullptr, - bool AllowUnknownOptions = false); + bool AllowUnknownOptions = false, + llvm::SourceMgr::DiagHandlerTy DiagHandler = nullptr); // Guesses the language from the ``FileName`` and ``Code`` to be formatted. // Defaults to FormatStyle::LK_Cpp. 
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 2d9c53cdf5bde8..b0a8ec0fec5e94 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -2828,7 +2828,8 @@ class Preprocessor { return AnnotationInfos.find(II)->second; } - void emitMacroExpansionWarnings(const Token &Identifier) const { + void emitMacroExpansionWarnings(const Token &Identifier, + bool IsIfnDef = false) const { IdentifierInfo *Info = Identifier.getIdentifierInfo(); if (Info->isDeprecatedMacro()) emitMacroDeprecationWarning(Identifier); @@ -2837,12 +2838,12 @@ class Preprocessor { !SourceMgr.isInMainFile(Identifier.getLocation())) emitRestrictExpansionWarning(Identifier); - if (Info->getName() == "INFINITY") - if (getLangOpts().NoHonorInfs) + if (!IsIfnDef) { + if (Info->getName() == "INFINITY" && getLangOpts().NoHonorInfs) emitRestrictInfNaNWarning(Identifier, 0); - if (Info->getName() == "NAN") - if (getLangOpts().NoHonorNaNs) + if (Info->getName() == "NAN" && getLangOpts().NoHonorNaNs) emitRestrictInfNaNWarning(Identifier, 1); + } } static void processPathForFileMacro(SmallVectorImpl &Path, diff --git a/clang/include/clang/Sema/Lookup.h b/clang/include/clang/Sema/Lookup.h index 9c93bf1e6fb428..2f2f2607a937fe 100644 --- a/clang/include/clang/Sema/Lookup.h +++ b/clang/include/clang/Sema/Lookup.h @@ -754,7 +754,8 @@ class LookupResult { private: void diagnoseAccess() { - if (isClassLookup() && getSema().getLangOpts().AccessControl) + if (!isAmbiguous() && isClassLookup() && + getSema().getLangOpts().AccessControl) getSema().CheckLookupAccess(*this); } diff --git a/clang/include/clang/Sema/ScopeInfo.h b/clang/include/clang/Sema/ScopeInfo.h index 6eaa74382685ba..06e47eed4e93b6 100644 --- a/clang/include/clang/Sema/ScopeInfo.h +++ b/clang/include/clang/Sema/ScopeInfo.h @@ -925,8 +925,8 @@ class LambdaScopeInfo final : /// that were defined in parent contexts. 
Used to avoid warnings when the /// shadowed variables are uncaptured by this lambda. struct ShadowedOuterDecl { - const VarDecl *VD; - const VarDecl *ShadowedDecl; + const NamedDecl *VD; + const NamedDecl *ShadowedDecl; }; llvm::SmallVector ShadowingDecls; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 1f1cbd11ff7358..6adb8fb7966b3f 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -1090,7 +1090,9 @@ class Sema final { if (FD) { FD->setWillHaveBody(true); S.ExprEvalContexts.back().InImmediateFunctionContext = - FD->isImmediateFunction(); + FD->isImmediateFunction() || + S.ExprEvalContexts[S.ExprEvalContexts.size() - 2] + .isConstantEvaluated(); S.ExprEvalContexts.back().InImmediateEscalatingFunctionContext = S.getLangOpts().CPlusPlus20 && FD->isImmediateEscalating(); } else diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index dd1451bbf2d2c9..62c25f5b7a0df8 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -2451,7 +2451,6 @@ class BitsUnpacker { uint32_t Value; uint32_t CurrentBitsIndex = ~0; }; - } // namespace clang #endif // LLVM_CLANG_SERIALIZATION_ASTREADER_H diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 5eb7aa3664569d..cc5de9a6295ebf 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -498,7 +498,11 @@ void ASTContext::attachCommentsToJustParsedDecls(ArrayRef Decls, return; FileID File; - for (Decl *D : Decls) { + for (const Decl *D : Decls) { + if (D->isInvalidDecl()) + continue; + + D = &adjustDeclToTemplate(*D); SourceLocation Loc = D->getLocation(); if (Loc.isValid()) { // See if there are any new comments that are not attached to a decl. 
@@ -1945,7 +1949,8 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { else if (VT->getVectorKind() == VectorKind::SveFixedLengthPredicate) // Adjust the alignment for fixed-length SVE predicates. Align = 16; - else if (VT->getVectorKind() == VectorKind::RVVFixedLengthData) + else if (VT->getVectorKind() == VectorKind::RVVFixedLengthData || + VT->getVectorKind() == VectorKind::RVVFixedLengthMask) // Adjust the alignment for fixed-length RVV vectors. Align = std::min(64, Width); break; @@ -9416,7 +9421,9 @@ bool ASTContext::areCompatibleVectorTypes(QualType FirstVec, Second->getVectorKind() != VectorKind::SveFixedLengthData && Second->getVectorKind() != VectorKind::SveFixedLengthPredicate && First->getVectorKind() != VectorKind::RVVFixedLengthData && - Second->getVectorKind() != VectorKind::RVVFixedLengthData) + Second->getVectorKind() != VectorKind::RVVFixedLengthData && + First->getVectorKind() != VectorKind::RVVFixedLengthMask && + Second->getVectorKind() != VectorKind::RVVFixedLengthMask) return true; return false; @@ -9522,8 +9529,11 @@ static uint64_t getRVVTypeSize(ASTContext &Context, const BuiltinType *Ty) { ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(Ty); - uint64_t EltSize = Context.getTypeSize(Info.ElementType); - uint64_t MinElts = Info.EC.getKnownMinValue(); + unsigned EltSize = Context.getTypeSize(Info.ElementType); + if (Info.ElementType == Context.BoolTy) + EltSize = 1; + + unsigned MinElts = Info.EC.getKnownMinValue(); return VScale->first * MinElts * EltSize; } @@ -9537,6 +9547,12 @@ bool ASTContext::areCompatibleRVVTypes(QualType FirstType, auto IsValidCast = [this](QualType FirstType, QualType SecondType) { if (const auto *BT = FirstType->getAs()) { if (const auto *VT = SecondType->getAs()) { + if (VT->getVectorKind() == VectorKind::RVVFixedLengthMask) { + BuiltinVectorTypeInfo Info = getBuiltinVectorTypeInfo(BT); + return FirstType->isRVVVLSBuiltinType() && + Info.ElementType == BoolTy && + 
getTypeSize(SecondType) == getRVVTypeSize(*this, BT); + } if (VT->getVectorKind() == VectorKind::RVVFixedLengthData || VT->getVectorKind() == VectorKind::Generic) return FirstType->isRVVVLSBuiltinType() && diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 26fdfa040796ed..1ee33fd7576d7d 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -4476,7 +4476,7 @@ unsigned FunctionDecl::getODRHash() { } class ODRHash Hash; - Hash.AddFunctionDecl(this); + Hash.AddFunctionDecl(this, /*SkipBody=*/shouldSkipCheckingODR()); setHasODRHash(true); ODRHash = Hash.CalculateHash(); return ODRHash; diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index 8163f9bdaf8d97..6b3c13ff206d23 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -1102,6 +1102,11 @@ bool Decl::isInAnotherModuleUnit() const { return M != getASTContext().getCurrentNamedModule(); } +bool Decl::shouldSkipCheckingODR() const { + return getASTContext().getLangOpts().SkipODRCheckInGMF && getOwningModule() && + getOwningModule()->isExplicitGlobalModule(); +} + static Decl::Kind getKind(const Decl *D) { return D->getKind(); } static Decl::Kind getKind(const DeclContext *DC) { return DC->getDeclKind(); } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index f1d07d022b2584..edf9b5e2d52bb3 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -7951,7 +7951,8 @@ class ExprEvaluatorBase // Overloaded operator calls to member functions are represented as normal // calls with '*this' as the first argument. const CXXMethodDecl *MD = dyn_cast(FD); - if (MD && MD->isImplicitObjectMemberFunction()) { + if (MD && + (MD->isImplicitObjectMemberFunction() || (OCE && MD->isStatic()))) { // FIXME: When selecting an implicit conversion for an overloaded // operator delete, we sometimes try to evaluate calls to conversion // operators without a 'this' parameter! 
@@ -7960,7 +7961,11 @@ class ExprEvaluatorBase if (!EvaluateObjectArgument(Info, Args[0], ThisVal)) return false; - This = &ThisVal; + + // If we are calling a static operator, the 'this' argument needs to be + // ignored after being evaluated. + if (MD->isInstance()) + This = &ThisVal; // If this is syntactically a simple assignment using a trivial // assignment operator, start the lifetimes of union members as needed, diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 40b1e086ddd0c6..688141b30441e8 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -3994,7 +3994,8 @@ void CXXNameMangler::mangleAArch64FixedSveVectorType( } void CXXNameMangler::mangleRISCVFixedRVVVectorType(const VectorType *T) { - assert(T->getVectorKind() == VectorKind::RVVFixedLengthData && + assert((T->getVectorKind() == VectorKind::RVVFixedLengthData || + T->getVectorKind() == VectorKind::RVVFixedLengthMask) && "expected fixed-length RVV vector!"); QualType EltType = T->getElementType(); @@ -4009,7 +4010,10 @@ void CXXNameMangler::mangleRISCVFixedRVVVectorType(const VectorType *T) { TypeNameOS << "int8"; break; case BuiltinType::UChar: - TypeNameOS << "uint8"; + if (T->getVectorKind() == VectorKind::RVVFixedLengthData) + TypeNameOS << "uint8"; + else + TypeNameOS << "bool"; break; case BuiltinType::Short: TypeNameOS << "int16"; @@ -4048,12 +4052,16 @@ void CXXNameMangler::mangleRISCVFixedRVVVectorType(const VectorType *T) { auto VScale = getASTContext().getTargetInfo().getVScaleRange( getASTContext().getLangOpts()); unsigned VLen = VScale->first * llvm::RISCV::RVVBitsPerBlock; - TypeNameOS << 'm'; - if (VecSizeInBits >= VLen) - TypeNameOS << (VecSizeInBits / VLen); - else - TypeNameOS << 'f' << (VLen / VecSizeInBits); + if (T->getVectorKind() == VectorKind::RVVFixedLengthData) { + TypeNameOS << 'm'; + if (VecSizeInBits >= VLen) + TypeNameOS << (VecSizeInBits / VLen); + else + TypeNameOS << 'f' << (VLen / VecSizeInBits); + } 
else { + TypeNameOS << (VLen / VecSizeInBits); + } TypeNameOS << "_t"; Out << "9__RVV_VLSI" << 'u' << TypeNameStr.size() << TypeNameStr << "Lj" @@ -4093,7 +4101,8 @@ void CXXNameMangler::mangleType(const VectorType *T) { T->getVectorKind() == VectorKind::SveFixedLengthPredicate) { mangleAArch64FixedSveVectorType(T); return; - } else if (T->getVectorKind() == VectorKind::RVVFixedLengthData) { + } else if (T->getVectorKind() == VectorKind::RVVFixedLengthData || + T->getVectorKind() == VectorKind::RVVFixedLengthMask) { mangleRISCVFixedRVVVectorType(T); return; } diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp index 3daba13d0fce7b..3c11b75d7472d9 100644 --- a/clang/lib/AST/JSONNodeDumper.cpp +++ b/clang/lib/AST/JSONNodeDumper.cpp @@ -703,6 +703,9 @@ void JSONNodeDumper::VisitVectorType(const VectorType *VT) { case VectorKind::RVVFixedLengthData: JOS.attribute("vectorKind", "fixed-length rvv data vector"); break; + case VectorKind::RVVFixedLengthMask: + JOS.attribute("vectorKind", "fixed-length rvv mask vector"); + break; } } diff --git a/clang/lib/AST/ODRHash.cpp b/clang/lib/AST/ODRHash.cpp index 5b98646a1e8dc0..2dbc259138a897 100644 --- a/clang/lib/AST/ODRHash.cpp +++ b/clang/lib/AST/ODRHash.cpp @@ -745,55 +745,8 @@ void ODRHash::AddEnumDecl(const EnumDecl *Enum) { if (Enum->isScoped()) AddBoolean(Enum->isScopedUsingClassTag()); - if (Enum->getIntegerTypeSourceInfo()) { - // FIMXE: This allows two enums with different spellings to have the same - // hash. 
- // - // // mod1.cppm - // module; - // extern "C" { - // typedef unsigned __int64 size_t; - // } - // namespace std { - // using :: size_t; - // } - // - // extern "C++" { - // namespace std { - // enum class align_val_t : std::size_t {}; - // } - // } - // - // export module mod1; - // export using std::align_val_t; - // - // // mod2.cppm - // module; - // extern "C" { - // typedef unsigned __int64 size_t; - // } - // - // extern "C++" { - // namespace std { - // enum class align_val_t : size_t {}; - // } - // } - // - // export module mod2; - // import mod1; - // export using std::align_val_t; - // - // The above example should be disallowed since it violates - // [basic.def.odr]p14: - // - // Each such definition shall consist of the same sequence of tokens - // - // The definitions of `std::align_val_t` in two module units have different - // spellings but we failed to give an error here. - // - // See https://github.com/llvm/llvm-project/issues/76638 for details. + if (Enum->getIntegerTypeSourceInfo()) AddQualType(Enum->getIntegerType().getCanonicalType()); - } // Filter out sub-Decls which will not be processed in order to get an // accurate count of Decl's. 
diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp index 2bdbeb08ef2046..3310d7dc24c59d 100644 --- a/clang/lib/AST/TemplateBase.cpp +++ b/clang/lib/AST/TemplateBase.cpp @@ -450,7 +450,8 @@ bool TemplateArgument::structurallyEquals(const TemplateArgument &Other) const { getAsIntegral() == Other.getAsIntegral(); case StructuralValue: { - if (getStructuralValueType() != Other.getStructuralValueType()) + if (getStructuralValueType().getCanonicalType() != + Other.getStructuralValueType().getCanonicalType()) return false; llvm::FoldingSetNodeID A, B; diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 48c6729a673819..ecf5de0be543d7 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -1623,6 +1623,9 @@ void TextNodeDumper::VisitVectorType(const VectorType *T) { case VectorKind::RVVFixedLengthData: OS << " fixed-length rvv data vector"; break; + case VectorKind::RVVFixedLengthMask: + OS << " fixed-length rvv mask vector"; + break; } OS << " " << T->getNumElements(); } diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 3db5ae182f32c4..d4103025591e73 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2479,6 +2479,9 @@ bool Type::isRVVVLSBuiltinType() const { IsFP, IsBF) \ case BuiltinType::Id: \ return NF == 1; +#define RVV_PREDICATE_TYPE(Name, Id, SingletonId, NumEls) \ + case BuiltinType::Id: \ + return true; #include "clang/Basic/RISCVVTypes.def" default: return false; @@ -2491,7 +2494,17 @@ QualType Type::getRVVEltType(const ASTContext &Ctx) const { assert(isRVVVLSBuiltinType() && "unsupported type!"); const BuiltinType *BTy = castAs(); - return Ctx.getBuiltinVectorTypeInfo(BTy).ElementType; + + switch (BTy->getKind()) { +#define RVV_PREDICATE_TYPE(Name, Id, SingletonId, NumEls) \ + case BuiltinType::Id: \ + return Ctx.UnsignedCharTy; + default: + return Ctx.getBuiltinVectorTypeInfo(BTy).ElementType; +#include "clang/Basic/RISCVVTypes.def" + 
} + + llvm_unreachable("Unhandled type"); } bool QualType::isPODType(const ASTContext &Context) const { diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 80b42c8f84a00a..e9b6e810b02e8d 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -694,6 +694,7 @@ void TypePrinter::printVectorBefore(const VectorType *T, raw_ostream &OS) { printBefore(T->getElementType(), OS); break; case VectorKind::RVVFixedLengthData: + case VectorKind::RVVFixedLengthMask: // FIXME: We prefer to print the size directly here, but have no way // to get the size of the type. OS << "__attribute__((__riscv_rvv_vector_bits__("; @@ -773,6 +774,7 @@ void TypePrinter::printDependentVectorBefore( printBefore(T->getElementType(), OS); break; case VectorKind::RVVFixedLengthData: + case VectorKind::RVVFixedLengthMask: // FIXME: We prefer to print the size directly here, but have no way // to get the size of the type. OS << "__attribute__((__riscv_rvv_vector_bits__("; diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp index 925217431d4d02..0dac8748a98aff 100644 --- a/clang/lib/Basic/Module.cpp +++ b/clang/lib/Basic/Module.cpp @@ -301,10 +301,9 @@ bool Module::directlyUses(const Module *Requested) { if (Requested->isSubModuleOf(Use)) return true; - // Anyone is allowed to use our builtin stdarg.h and stddef.h and their - // accompanying modules. - if (Requested->getTopLevelModuleName() == "_Builtin_stdarg" || - Requested->getTopLevelModuleName() == "_Builtin_stddef") + // Anyone is allowed to use our builtin stddef.h and its accompanying modules. 
+ if (Requested->fullModuleNameIs({"_Builtin_stddef", "max_align_t"}) || + Requested->fullModuleNameIs({"_Builtin_stddef_wint_t"})) return true; if (NoUndeclaredIncludes) diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index d47181bfca4fc8..f5a5d689fa095c 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -258,7 +258,6 @@ void AArch64TargetInfo::getTargetDefinesARMV83A(const LangOptions &Opts, MacroBuilder &Builder) const { Builder.defineMacro("__ARM_FEATURE_COMPLEX", "1"); Builder.defineMacro("__ARM_FEATURE_JCVT", "1"); - Builder.defineMacro("__ARM_FEATURE_PAUTH", "1"); // Also include the Armv8.2 defines getTargetDefinesARMV82A(Opts, Builder); } @@ -387,6 +386,11 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__ARM_ALIGN_MAX_STACK_PWR", "4"); + // These macros are set when Clang can parse declarations with these + // attributes. + Builder.defineMacro("__ARM_STATE_ZA", "1"); + Builder.defineMacro("__ARM_STATE_ZT0", "1"); + // 0xe implies support for half, single and double precision operations. 
if (FPU & FPUMode) Builder.defineMacro("__ARM_FP", "0xE"); @@ -431,6 +435,17 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasSVE2 && HasSVE2SM4) Builder.defineMacro("__ARM_FEATURE_SVE2_SM4", "1"); + if (HasSME) { + Builder.defineMacro("__ARM_FEATURE_SME"); + Builder.defineMacro("__ARM_FEATURE_LOCALLY_STREAMING", "1"); + } + + if (HasSME2) { + Builder.defineMacro("__ARM_FEATURE_SME"); + Builder.defineMacro("__ARM_FEATURE_SME2"); + Builder.defineMacro("__ARM_FEATURE_LOCALLY_STREAMING", "1"); + } + if (HasCRC) Builder.defineMacro("__ARM_FEATURE_CRC32", "1"); @@ -686,6 +701,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3) .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4) .Case("sme", HasSME) + .Case("sme2", HasSME2) .Case("sme-f64f64", HasSMEF64F64) .Case("sme-i16i64", HasSMEI16I64) .Case("sme-fa64", HasSMEFA64) @@ -806,6 +822,12 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, HasBFloat16 = true; HasFullFP16 = true; } + if (Feature == "+sme2") { + HasSME = true; + HasSME2 = true; + HasBFloat16 = true; + HasFullFP16 = true; + } if (Feature == "+sme-f64f64") { HasSME = true; HasSMEF64F64 = true; @@ -1164,6 +1186,8 @@ TargetInfo::BuiltinVaListKind AArch64TargetInfo::getBuiltinVaListKind() const { } const char *const AArch64TargetInfo::GCCRegNames[] = { + // clang-format off + // 32-bit Integer registers "w0", "w1", "w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15", "w16", "w17", "w18", "w19", "w20", "w21", "w22", @@ -1200,7 +1224,12 @@ const char *const AArch64TargetInfo::GCCRegNames[] = { // SVE predicate-as-counter registers "pn0", "pn1", "pn2", "pn3", "pn4", "pn5", "pn6", "pn7", "pn8", - "pn9", "pn10", "pn11", "pn12", "pn13", "pn14", "pn15" + "pn9", "pn10", "pn11", "pn12", "pn13", "pn14", "pn15", + + // SME registers + "za", "zt0", + + // clang-format on }; ArrayRef AArch64TargetInfo::getGCCRegNames() const { diff 
--git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index f0e0782e7abe97..9699222b0bf773 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -68,6 +68,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasCCDP = false; bool HasFRInt3264 = false; bool HasSME = false; + bool HasSME2 = false; bool HasSMEF64F64 = false; bool HasSMEI16I64 = false; bool HasSB = false; diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h index f46b95abfd75c7..23d4e1b598fa1e 100644 --- a/clang/lib/Basic/Targets/Mips.h +++ b/clang/lib/Basic/Targets/Mips.h @@ -237,12 +237,14 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo { case 'r': // CPU registers. case 'd': // Equivalent to "r" unless generating MIPS16 code. case 'y': // Equivalent to "r", backward compatibility only. - case 'f': // floating-point registers. case 'c': // $25 for indirect jumps case 'l': // lo register case 'x': // hilo register pair Info.setAllowsRegister(); return true; + case 'f': // floating-point registers. + Info.setAllowsRegister(); + return FloatABI != SoftFloat; case 'I': // Signed 16-bit constant case 'J': // Integer 0 case 'K': // Unsigned 16-bit constant diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index ec203f6f28bc17..4f22d35f9d3a94 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -186,6 +186,14 @@ class EmitAssemblyHelper { TargetTriple.getVendor() != llvm::Triple::Apple; } + /// Check whether we should emit a flag for UnifiedLTO. + /// The UnifiedLTO module flag should be set when UnifiedLTO is enabled for + /// ThinLTO or Full LTO with module summaries. 
+ bool shouldEmitUnifiedLTOModueFlag() const { + return CodeGenOpts.UnifiedLTO && + (CodeGenOpts.PrepareForThinLTO || shouldEmitRegularLTOSummary()); + } + public: EmitAssemblyHelper(DiagnosticsEngine &_Diags, const HeaderSearchOptions &HeaderSearchOpts, @@ -401,6 +409,7 @@ static bool initTargetOptions(DiagnosticsEngine &Diags, Options.UniqueBasicBlockSectionNames = CodeGenOpts.UniqueBasicBlockSectionNames; Options.TLSSize = CodeGenOpts.TLSSize; + Options.EnableTLSDESC = CodeGenOpts.EnableTLSDESC; Options.EmulatedTLS = CodeGenOpts.EmulatedTLS; Options.DebuggerTuning = CodeGenOpts.getDebuggerTuning(); Options.EmitStackSizeSection = CodeGenOpts.StackSizeSection; @@ -1028,7 +1037,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (!actionRequiresCodeGen(Action) && CodeGenOpts.VerifyModule) MPM.addPass(VerifierPass()); - if (Action == Backend_EmitBC || Action == Backend_EmitLL) { + if (Action == Backend_EmitBC || Action == Backend_EmitLL || + CodeGenOpts.FatLTO) { if (CodeGenOpts.PrepareForThinLTO && !CodeGenOpts.DisableLLVMPasses) { if (!TheModule->getModuleFlag("EnableSplitLTOUnit")) TheModule->addModuleFlag(llvm::Module::Error, "EnableSplitLTOUnit", @@ -1039,11 +1049,9 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (!ThinLinkOS) return; } - if (CodeGenOpts.UnifiedLTO) - TheModule->addModuleFlag(llvm::Module::Error, "UnifiedLTO", uint32_t(1)); MPM.addPass(ThinLTOBitcodeWriterPass( *OS, ThinLinkOS ? 
&ThinLinkOS->os() : nullptr)); - } else { + } else if (Action == Backend_EmitLL) { MPM.addPass(PrintModulePass(*OS, "", CodeGenOpts.EmitLLVMUseLists, /*EmitLTOSummary=*/true)); } @@ -1057,24 +1065,17 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (!TheModule->getModuleFlag("EnableSplitLTOUnit")) TheModule->addModuleFlag(llvm::Module::Error, "EnableSplitLTOUnit", uint32_t(1)); - if (CodeGenOpts.UnifiedLTO) - TheModule->addModuleFlag(llvm::Module::Error, "UnifiedLTO", uint32_t(1)); } - if (Action == Backend_EmitBC) + if (Action == Backend_EmitBC) { MPM.addPass(BitcodeWriterPass(*OS, CodeGenOpts.EmitLLVMUseLists, EmitLTOSummary)); - else + } else if (Action == Backend_EmitLL) { MPM.addPass(PrintModulePass(*OS, "", CodeGenOpts.EmitLLVMUseLists, EmitLTOSummary)); + } } - } - if (CodeGenOpts.FatLTO) { - // Set the EnableSplitLTOUnit and UnifiedLTO module flags, since FatLTO - // uses a different action than Backend_EmitBC or Backend_EmitLL. - if (!TheModule->getModuleFlag("EnableSplitLTOUnit")) - TheModule->addModuleFlag(llvm::Module::Error, "EnableSplitLTOUnit", - uint32_t(CodeGenOpts.EnableSplitLTOUnit)); - if (CodeGenOpts.UnifiedLTO && !TheModule->getModuleFlag("UnifiedLTO")) + + if (shouldEmitUnifiedLTOModueFlag()) TheModule->addModuleFlag(llvm::Module::Error, "UnifiedLTO", uint32_t(1)); } diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 7ef764b8e1ac80..44ddd2428b10f5 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -823,29 +823,32 @@ const FieldDecl *CodeGenFunction::FindFlexibleArrayMemberField( ASTContext &Ctx, const RecordDecl *RD, StringRef Name, uint64_t &Offset) { const LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel = getLangOpts().getStrictFlexArraysLevel(); - unsigned FieldNo = 0; - bool IsUnion = RD->isUnion(); + uint32_t FieldNo = 0; - for (const Decl *D : RD->decls()) { - if (const auto *Field = dyn_cast(D); - Field && (Name.empty() || Field->getNameAsString() 
== Name) && + if (RD->isImplicit()) + return nullptr; + + for (const FieldDecl *FD : RD->fields()) { + if ((Name.empty() || FD->getNameAsString() == Name) && Decl::isFlexibleArrayMemberLike( - Ctx, Field, Field->getType(), StrictFlexArraysLevel, + Ctx, FD, FD->getType(), StrictFlexArraysLevel, /*IgnoreTemplateOrMacroSubstitution=*/true)) { const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD); Offset += Layout.getFieldOffset(FieldNo); - return Field; + return FD; } - if (const auto *Record = dyn_cast(D)) - if (const FieldDecl *Field = - FindFlexibleArrayMemberField(Ctx, Record, Name, Offset)) { + QualType Ty = FD->getType(); + if (Ty->isRecordType()) { + if (const FieldDecl *Field = FindFlexibleArrayMemberField( + Ctx, Ty->getAsRecordDecl(), Name, Offset)) { const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD); Offset += Layout.getFieldOffset(FieldNo); return Field; } + } - if (!IsUnion && isa(D)) + if (!RD->isUnion()) ++FieldNo; } @@ -18279,65 +18282,216 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32: - case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: { + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12: + case 
AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: + case 
AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: { // These operations perform a matrix multiplication and accumulation of // the form: // D = A * B + C - // The return type always matches the type of matrix C. - unsigned ArgForMatchingRetType; + // We need to specify one type for matrices AB and one for matrices CD. + // Sparse matrix operations can have different types for A and B as well as + // an additional type for sparsity index. + // Destination type should be put before types used for source operands. + SmallVector ArgsForMatchingMatrixTypes; + // On GFX12, the intrinsics with 16-bit accumulator use a packed layout. + // There is no need for the variable opsel argument, so always set it to + // "false". + bool AppendFalseForOpselArg = false; unsigned BuiltinWMMAOp; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64: - ArgForMatchingRetType = 2; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64: - ArgForMatchingRetType = 2; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16; break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12: + case 
AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12: + AppendFalseForOpselArg = true; + LLVM_FALLTHROUGH; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16; break; + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12: + AppendFalseForOpselArg = true; + LLVM_FALLTHROUGH; case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16; break; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied; break; case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32: case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied; break; case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: - ArgForMatchingRetType = 4; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8; break; case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64: - ArgForMatchingRetType = 4; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12: + 
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4; break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16; + 
break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: + 
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8; + break; } SmallVector Args; for (int i = 0, e = E->getNumArgs(); i != e; ++i) Args.push_back(EmitScalarExpr(E->getArg(i))); + if (AppendFalseForOpselArg) + Args.push_back(Builder.getFalse()); - Function *F = CGM.getIntrinsic(BuiltinWMMAOp, - {Args[ArgForMatchingRetType]->getType()}); + SmallVector ArgTypes; + for (auto ArgIdx : ArgsForMatchingMatrixTypes) + ArgTypes.push_back(Args[ArgIdx]->getType()); + Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes); return Builder.CreateCall(F, Args); } diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 28c211aa631e4d..a6a2f3595fe7db 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1581,6 +1581,11 @@ bool CodeGenModule::ReturnTypeUsesSRet(const CGFunctionInfo &FI) { return RI.isIndirect() || (RI.isInAlloca() && RI.getInAllocaSRet()); } +bool CodeGenModule::ReturnTypeHasInReg(const CGFunctionInfo &FI) { + const auto &RI = FI.getReturnInfo(); + return RI.getInReg(); +} + bool CodeGenModule::ReturnSlotInterferesWithArgs(const CGFunctionInfo &FI) { return ReturnTypeUsesSRet(FI) && getTargetCodeGenInfo().doesReturnSlotInterfereWithArgs(); diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index bbe14ef4c17244..aa9997b87ecfa7 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -1241,27 +1241,38 @@ static void emitStoresForConstant(CodeGenModule &CGM, const VarDecl &D, return; } - // If the initializer is small, use a handful of stores. + // If the initializer is small or trivialAutoVarInit is set, use a handful of + // stores. 
+ bool IsTrivialAutoVarInitPattern = + CGM.getContext().getLangOpts().getTrivialAutoVarInit() == + LangOptions::TrivialAutoVarInitKind::Pattern; if (shouldSplitConstantStore(CGM, ConstantSize)) { if (auto *STy = dyn_cast(Ty)) { - const llvm::StructLayout *Layout = - CGM.getDataLayout().getStructLayout(STy); - for (unsigned i = 0; i != constant->getNumOperands(); i++) { - CharUnits CurOff = CharUnits::fromQuantity(Layout->getElementOffset(i)); - Address EltPtr = Builder.CreateConstInBoundsByteGEP( - Loc.withElementType(CGM.Int8Ty), CurOff); - emitStoresForConstant(CGM, D, EltPtr, isVolatile, Builder, - constant->getAggregateElement(i), IsAutoInit); + if (STy == Loc.getElementType() || + (STy != Loc.getElementType() && IsTrivialAutoVarInitPattern)) { + const llvm::StructLayout *Layout = + CGM.getDataLayout().getStructLayout(STy); + for (unsigned i = 0; i != constant->getNumOperands(); i++) { + CharUnits CurOff = + CharUnits::fromQuantity(Layout->getElementOffset(i)); + Address EltPtr = Builder.CreateConstInBoundsByteGEP( + Loc.withElementType(CGM.Int8Ty), CurOff); + emitStoresForConstant(CGM, D, EltPtr, isVolatile, Builder, + constant->getAggregateElement(i), IsAutoInit); + } + return; } - return; } else if (auto *ATy = dyn_cast(Ty)) { - for (unsigned i = 0; i != ATy->getNumElements(); i++) { - Address EltPtr = Builder.CreateConstGEP( - Loc.withElementType(ATy->getElementType()), i); - emitStoresForConstant(CGM, D, EltPtr, isVolatile, Builder, - constant->getAggregateElement(i), IsAutoInit); + if (ATy == Loc.getElementType() || + (ATy != Loc.getElementType() && IsTrivialAutoVarInitPattern)) { + for (unsigned i = 0; i != ATy->getNumElements(); i++) { + Address EltPtr = Builder.CreateConstGEP( + Loc.withElementType(ATy->getElementType()), i); + emitStoresForConstant(CGM, D, EltPtr, isVolatile, Builder, + constant->getAggregateElement(i), IsAutoInit); + } + return; } - return; } } diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 
c5f6b6d3a99f0b..f8f9979099775f 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -5846,6 +5846,7 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const CGCallee &OrigCallee // destruction order is not necessarily reverse construction order. // FIXME: Revisit this based on C++ committee response to unimplementability. EvaluationOrder Order = EvaluationOrder::Default; + bool StaticOperator = false; if (auto *OCE = dyn_cast(E)) { if (OCE->isAssignmentOp()) Order = EvaluationOrder::ForceRightToLeft; @@ -5863,10 +5864,22 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const CGCallee &OrigCallee break; } } + + if (const auto *MD = + dyn_cast_if_present(OCE->getCalleeDecl()); + MD && MD->isStatic()) + StaticOperator = true; } - EmitCallArgs(Args, dyn_cast(FnType), E->arguments(), - E->getDirectCallee(), /*ParamsToSkip*/ 0, Order); + auto Arguments = E->arguments(); + if (StaticOperator) { + // If we're calling a static operator, we need to emit the object argument + // and ignore it. 
+ EmitIgnoredExpr(E->getArg(0)); + Arguments = drop_begin(Arguments, 1); + } + EmitCallArgs(Args, dyn_cast(FnType), Arguments, + E->getDirectCallee(), /*ParamsToSkip=*/0, Order); const CGFunctionInfo &FnInfo = CGM.getTypes().arrangeFreeFunctionCall( Args, FnType, /*ChainCall=*/Chain); diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index a36b0cdddaf0af..05e3f8d4bfc2a3 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -2903,23 +2903,29 @@ CGObjCGNU::GenerateMessageSend(CodeGenFunction &CGF, break; case CodeGenOptions::Mixed: case CodeGenOptions::NonLegacy: + StringRef name = "objc_msgSend"; if (CGM.ReturnTypeUsesFPRet(ResultType)) { - imp = - CGM.CreateRuntimeFunction(llvm::FunctionType::get(IdTy, IdTy, true), - "objc_msgSend_fpret") - .getCallee(); + name = "objc_msgSend_fpret"; } else if (CGM.ReturnTypeUsesSRet(MSI.CallInfo)) { - // The actual types here don't matter - we're going to bitcast the - // function anyway - imp = - CGM.CreateRuntimeFunction(llvm::FunctionType::get(IdTy, IdTy, true), - "objc_msgSend_stret") - .getCallee(); - } else { - imp = CGM.CreateRuntimeFunction( - llvm::FunctionType::get(IdTy, IdTy, true), "objc_msgSend") - .getCallee(); + name = "objc_msgSend_stret"; + + // The address of the memory block is be passed in x8 for POD type, + // or in x0 for non-POD type (marked as inreg). 
+ bool shouldCheckForInReg = + CGM.getContext() + .getTargetInfo() + .getTriple() + .isWindowsMSVCEnvironment() && + CGM.getContext().getTargetInfo().getTriple().isAArch64(); + if (shouldCheckForInReg && CGM.ReturnTypeHasInReg(MSI.CallInfo)) { + name = "objc_msgSend_stret2"; + } } + // The actual types here don't matter - we're going to bitcast the + // function anyway + imp = CGM.CreateRuntimeFunction(llvm::FunctionType::get(IdTy, IdTy, true), + name) + .getCallee(); } // Reset the receiver in case the lookup modified it diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index ec34680fd3f7e6..d9ece4d98eecae 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -1239,6 +1239,9 @@ class CodeGenModule : public CodeGenTypeCache { /// Return true iff the given type uses 'sret' when used as a return type. bool ReturnTypeUsesSRet(const CGFunctionInfo &FI); + /// Return true iff the given type has `inreg` set. + bool ReturnTypeHasInReg(const CGFunctionInfo &FI); + /// Return true iff the given type uses an argument slot when 'sret' is used /// as a return type. bool ReturnSlotInterferesWithArgs(const CGFunctionInfo &FI); diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 5d7c3847745762..fb4e86e8bd8053 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -240,9 +240,12 @@ struct MapRegionCounters : public RecursiveASTVisitor { if (MCDCMaxCond == 0) return true; - /// At the top of the logical operator nest, reset the number of conditions. - if (LogOpStack.empty()) + /// At the top of the logical operator nest, reset the number of conditions, + /// also forget previously seen split nesting cases. 
+ if (LogOpStack.empty()) { NumCond = 0; + SplitNestedLogicalOp = false; + } if (const Expr *E = dyn_cast(S)) { const BinaryOperator *BinOp = dyn_cast(E->IgnoreParens()); @@ -293,7 +296,7 @@ struct MapRegionCounters : public RecursiveASTVisitor { "contains an operation with a nested boolean expression. " "Expression will not be covered"); Diag.Report(S->getBeginLoc(), DiagID); - return false; + return true; } /// Was the maximum number of conditions encountered? @@ -304,7 +307,7 @@ struct MapRegionCounters : public RecursiveASTVisitor { "number of conditions (%0) exceeds max (%1). " "Expression will not be covered"); Diag.Report(S->getBeginLoc(), DiagID) << NumCond << MCDCMaxCond; - return false; + return true; } // Otherwise, allocate the number of bytes required for the bitmap diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index 5eca00f22bb83c..ae4e6d4c88c02d 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -1207,6 +1207,12 @@ struct CounterCoverageMappingBuilder /// Find a valid gap range between \p AfterLoc and \p BeforeLoc. std::optional findGapAreaBetween(SourceLocation AfterLoc, SourceLocation BeforeLoc) { + // Some statements (like AttributedStmt and ImplicitValueInitExpr) don't + // have valid source locations. Do not emit a gap region if this is the case + // in either AfterLoc end or BeforeLoc end. + if (AfterLoc.isInvalid() || BeforeLoc.isInvalid()) + return std::nullopt; + // If AfterLoc is in function-like macro, use the right parenthesis // location. if (AfterLoc.isMacroID()) { @@ -1370,9 +1376,8 @@ struct CounterCoverageMappingBuilder for (const Stmt *Child : S->children()) if (Child) { // If last statement contains terminate statements, add a gap area - // between the two statements. Skipping attributed statements, because - // they don't have valid start location. 
- if (LastStmt && HasTerminateStmt && !isa(Child)) { + // between the two statements. + if (LastStmt && HasTerminateStmt) { auto Gap = findGapAreaBetween(getEnd(LastStmt), getStart(Child)); if (Gap) fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), @@ -1812,8 +1817,10 @@ struct CounterCoverageMappingBuilder assert(S->isConstexpr()); // evaluate constant condition... - const auto *E = cast(S->getCond()); - const bool isTrue = E->getResultAsAPSInt().getExtValue(); + const bool isTrue = + S->getCond() + ->EvaluateKnownConstInt(CVM.getCodeGenModule().getContext()) + .getBoolValue(); extendRegion(S); diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index 172c4c937b9728..4d0f4c63f843b8 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -1135,9 +1135,15 @@ static bool isTrivialForMSVC(const CXXRecordDecl *RD, QualType Ty, return false; if (RD->hasNonTrivialCopyAssignment()) return false; - for (const CXXConstructorDecl *Ctor : RD->ctors()) - if (Ctor->isUserProvided()) - return false; + for (const Decl *D : RD->decls()) { + if (auto *Ctor = dyn_cast(D)) { + if (Ctor->isUserProvided()) + return false; + } else if (auto *Template = dyn_cast(D)) { + if (isa(Template->getTemplatedDecl())) + return false; + } + } if (RD->hasNonTrivialDestructor()) return false; return true; diff --git a/clang/lib/CodeGen/Targets/RISCV.cpp b/clang/lib/CodeGen/Targets/RISCV.cpp index 0851d1993d0c0f..02c86ad2e58cac 100644 --- a/clang/lib/CodeGen/Targets/RISCV.cpp +++ b/clang/lib/CodeGen/Targets/RISCV.cpp @@ -321,20 +321,28 @@ ABIArgInfo RISCVABIInfo::coerceVLSVector(QualType Ty) const { assert(Ty->isVectorType() && "expected vector type!"); const auto *VT = Ty->castAs(); - assert(VT->getVectorKind() == VectorKind::RVVFixedLengthData && - "Unexpected vector kind"); - assert(VT->getElementType()->isBuiltinType() && "expected builtin type!"); auto VScale = 
getContext().getTargetInfo().getVScaleRange(getContext().getLangOpts()); + + unsigned NumElts = VT->getNumElements(); + llvm::Type *EltType; + if (VT->getVectorKind() == VectorKind::RVVFixedLengthMask) { + NumElts *= 8; + EltType = llvm::Type::getInt1Ty(getVMContext()); + } else { + assert(VT->getVectorKind() == VectorKind::RVVFixedLengthData && + "Unexpected vector kind"); + EltType = CGT.ConvertType(VT->getElementType()); + } + // The MinNumElts is simplified from equation: // NumElts / VScale = // (EltSize * NumElts / (VScale * RVVBitsPerBlock)) // * (RVVBitsPerBlock / EltSize) llvm::ScalableVectorType *ResType = - llvm::ScalableVectorType::get(CGT.ConvertType(VT->getElementType()), - VT->getNumElements() / VScale->first); + llvm::ScalableVectorType::get(EltType, NumElts / VScale->first); return ABIArgInfo::getDirect(ResType); } @@ -437,7 +445,8 @@ ABIArgInfo RISCVABIInfo::classifyArgumentType(QualType Ty, bool IsFixed, } if (const VectorType *VT = Ty->getAs()) - if (VT->getVectorKind() == VectorKind::RVVFixedLengthData) + if (VT->getVectorKind() == VectorKind::RVVFixedLengthData || + VT->getVectorKind() == VectorKind::RVVFixedLengthMask) return coerceVLSVector(Ty); // Aggregates which are <= 2*XLen will be passed in registers if possible, diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 7109faa1072de5..93cddf742d521d 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4764,9 +4764,9 @@ Action *Driver::ConstructPhaseAction( case phases::Backend: { if (isUsingLTO() && TargetDeviceOffloadKind == Action::OFK_None) { types::ID Output; - if (Args.hasArg(options::OPT_ffat_lto_objects)) - Output = Args.hasArg(options::OPT_emit_llvm) ? 
types::TY_LTO_IR - : types::TY_PP_Asm; + if (Args.hasArg(options::OPT_ffat_lto_objects) && + !Args.hasArg(options::OPT_emit_llvm)) + Output = types::TY_PP_Asm; else if (Args.hasArg(options::OPT_S)) Output = types::TY_LTO_IR; else diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp index 22e583021515e5..ae1a4ba7882627 100644 --- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp @@ -178,4 +178,85 @@ void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args, else Features.push_back("-hard-quad-float"); } + + if (Args.hasArg(options::OPT_ffixed_g1)) + Features.push_back("+reserve-g1"); + + if (Args.hasArg(options::OPT_ffixed_g2)) + Features.push_back("+reserve-g2"); + + if (Args.hasArg(options::OPT_ffixed_g3)) + Features.push_back("+reserve-g3"); + + if (Args.hasArg(options::OPT_ffixed_g4)) + Features.push_back("+reserve-g4"); + + if (Args.hasArg(options::OPT_ffixed_g5)) + Features.push_back("+reserve-g5"); + + if (Args.hasArg(options::OPT_ffixed_g6)) + Features.push_back("+reserve-g6"); + + if (Args.hasArg(options::OPT_ffixed_g7)) + Features.push_back("+reserve-g7"); + + if (Args.hasArg(options::OPT_ffixed_o0)) + Features.push_back("+reserve-o0"); + + if (Args.hasArg(options::OPT_ffixed_o1)) + Features.push_back("+reserve-o1"); + + if (Args.hasArg(options::OPT_ffixed_o2)) + Features.push_back("+reserve-o2"); + + if (Args.hasArg(options::OPT_ffixed_o3)) + Features.push_back("+reserve-o3"); + + if (Args.hasArg(options::OPT_ffixed_o4)) + Features.push_back("+reserve-o4"); + + if (Args.hasArg(options::OPT_ffixed_o5)) + Features.push_back("+reserve-o5"); + + if (Args.hasArg(options::OPT_ffixed_l0)) + Features.push_back("+reserve-l0"); + + if (Args.hasArg(options::OPT_ffixed_l1)) + Features.push_back("+reserve-l1"); + + if (Args.hasArg(options::OPT_ffixed_l2)) + Features.push_back("+reserve-l2"); + + if (Args.hasArg(options::OPT_ffixed_l3)) + 
Features.push_back("+reserve-l3"); + + if (Args.hasArg(options::OPT_ffixed_l4)) + Features.push_back("+reserve-l4"); + + if (Args.hasArg(options::OPT_ffixed_l5)) + Features.push_back("+reserve-l5"); + + if (Args.hasArg(options::OPT_ffixed_l6)) + Features.push_back("+reserve-l6"); + + if (Args.hasArg(options::OPT_ffixed_l7)) + Features.push_back("+reserve-l7"); + + if (Args.hasArg(options::OPT_ffixed_i0)) + Features.push_back("+reserve-i0"); + + if (Args.hasArg(options::OPT_ffixed_i1)) + Features.push_back("+reserve-i1"); + + if (Args.hasArg(options::OPT_ffixed_i2)) + Features.push_back("+reserve-i2"); + + if (Args.hasArg(options::OPT_ffixed_i3)) + Features.push_back("+reserve-i3"); + + if (Args.hasArg(options::OPT_ffixed_i4)) + Features.push_back("+reserve-i4"); + + if (Args.hasArg(options::OPT_ffixed_i5)) + Features.push_back("+reserve-i5"); } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 5dc614e11aab59..aa344b3465ab27 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3942,6 +3942,10 @@ static bool RenderModulesOptions(Compilation &C, const Driver &D, Args.ClaimAllArgs(options::OPT_fmodules_disable_diagnostic_validation); } + // FIXME: We provisionally don't check ODR violations for decls in the global + // module fragment. + CmdArgs.push_back("-fskip-odr-check-in-gmf"); + // Claim `-fmodule-output` and `-fmodule-output=` to avoid unused warnings. Args.ClaimAllArgs(options::OPT_fmodule_output); Args.ClaimAllArgs(options::OPT_fmodule_output_EQ); @@ -5779,6 +5783,14 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, // NVPTX/AMDGPU does not care about the code model and will accept // whatever works for the host. 
Ok = true; + } else if (Triple.isSPARC64()) { + if (CM == "medlow") + CM = "small"; + else if (CM == "medmid") + CM = "medium"; + else if (CM == "medany") + CM = "large"; + Ok = CM == "small" || CM == "medium" || CM == "large"; } if (Ok) { CmdArgs.push_back(Args.MakeArgString("-mcmodel=" + CM)); @@ -5822,6 +5834,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddLastArg(CmdArgs, options::OPT_mtls_size_EQ); } + if (isTLSDESCEnabled(TC, Args)) + CmdArgs.push_back("-enable-tlsdesc"); + // Add the target cpu std::string CPU = getCPUName(D, Args, Triple, /*FromAs*/ false); if (!CPU.empty()) { diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index fadaf3e60c6616..2b916f0003368d 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -729,6 +729,33 @@ bool tools::isUseSeparateSections(const llvm::Triple &Triple) { return Triple.isPS(); } +bool tools::isTLSDESCEnabled(const ToolChain &TC, + const llvm::opt::ArgList &Args) { + const llvm::Triple &Triple = TC.getEffectiveTriple(); + Arg *A = Args.getLastArg(options::OPT_mtls_dialect_EQ); + if (!A) + return Triple.hasDefaultTLSDESC(); + StringRef V = A->getValue(); + bool SupportedArgument = false, EnableTLSDESC = false; + bool Unsupported = !Triple.isOSBinFormatELF(); + if (Triple.isRISCV()) { + SupportedArgument = V == "desc" || V == "trad"; + EnableTLSDESC = V == "desc"; + } else if (Triple.isX86()) { + SupportedArgument = V == "gnu"; + } else { + Unsupported = true; + } + if (Unsupported) { + TC.getDriver().Diag(diag::err_drv_unsupported_opt_for_target) + << A->getSpelling() << Triple.getTriple(); + } else if (!SupportedArgument) { + TC.getDriver().Diag(diag::err_drv_unsupported_option_argument_for_target) + << A->getSpelling() << V << Triple.getTriple(); + } + return EnableTLSDESC; +} + void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, ArgStringList &CmdArgs, const 
InputInfo &Output, const InputInfo &Input, bool IsThinLTO) { @@ -783,6 +810,28 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, "-generate-arange-section")); } + // Pass vector library arguments to LTO. + Arg *ArgVecLib = Args.getLastArg(options::OPT_fveclib); + if (ArgVecLib && ArgVecLib->getNumValues() == 1) { + // Map the vector library names from clang front-end to opt front-end. The + // values are taken from the TargetLibraryInfo class command line options. + std::optional OptVal = + llvm::StringSwitch>(ArgVecLib->getValue()) + .Case("Accelerate", "Accelerate") + .Case("LIBMVEC", "LIBMVEC-X86") + .Case("MASSV", "MASSV") + .Case("SVML", "SVML") + .Case("SLEEF", "sleefgnuabi") + .Case("Darwin_libsystem_m", "Darwin_libsystem_m") + .Case("ArmPL", "ArmPL") + .Case("none", "none") + .Default(std::nullopt); + + if (OptVal) + CmdArgs.push_back(Args.MakeArgString( + Twine(PluginOptPrefix) + "-vector-library=" + OptVal.value())); + } + // Try to pass driver level flags relevant to LTO code generation down to // the plugin. @@ -988,6 +1037,9 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-emulated-tls")); } + if (isTLSDESCEnabled(ToolChain, Args)) + CmdArgs.push_back( + Args.MakeArgString(Twine(PluginOptPrefix) + "-enable-tlsdesc")); if (Args.hasFlag(options::OPT_fstack_size_section, options::OPT_fno_stack_size_section, false)) diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h index 25d68345a9f9eb..807867f13a5c30 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -144,6 +144,9 @@ llvm::StringRef getLTOParallelism(const llvm::opt::ArgList &Args, bool areOptimizationsEnabled(const llvm::opt::ArgList &Args); bool isUseSeparateSections(const llvm::Triple &Triple); +// Parse -mtls-dialect=. 
Return true if the target supports both general-dynamic +// and TLSDESC, and TLSDESC is requested. +bool isTLSDESCEnabled(const ToolChain &TC, const llvm::opt::ArgList &Args); /// \p EnvVar is split by system delimiter for environment variables. /// If \p ArgName is "-I", "-L", or an empty string, each entry from \p EnvVar diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp index 57f4600727ec89..0b16b660364f07 100644 --- a/clang/lib/Driver/ToolChains/WebAssembly.cpp +++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp @@ -44,8 +44,15 @@ std::string wasm::Linker::getLinkerPath(const ArgList &Args) const { llvm::sys::fs::can_execute(UseLinker)) return std::string(UseLinker); - // Accept 'lld', and 'ld' as aliases for the default linker - if (UseLinker != "lld" && UseLinker != "ld") + // Interpret 'lld' as explicitly requesting `wasm-ld`, so look for that + // linker. Note that for `wasm32-wasip2` this overrides the default linker + // of `wasm-component-ld`. + if (UseLinker == "lld") { + return ToolChain.GetProgramPath("wasm-ld"); + } + + // Allow 'ld' as an alias for the default linker + if (UseLinker != "ld") ToolChain.getDriver().Diag(diag::err_drv_invalid_linker_name) << A->getAsString(Args); } @@ -73,6 +80,16 @@ void wasm::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (Args.hasArg(options::OPT_s)) CmdArgs.push_back("--strip-all"); + // On `wasip2` the default linker is `wasm-component-ld` which wraps the + // execution of `wasm-ld`. Find `wasm-ld` and pass it as an argument of where + // to find it to avoid it needing to hunt and rediscover or search `PATH` for + // where it is. 
+ if (llvm::sys::path::stem(Linker).ends_with_insensitive( + "wasm-component-ld")) { + CmdArgs.push_back("--wasm-ld-path"); + CmdArgs.push_back(Args.MakeArgString(ToolChain.GetProgramPath("wasm-ld"))); + } + Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_u}); ToolChain.AddFilePathLibArgs(Args, CmdArgs); @@ -221,6 +238,12 @@ WebAssembly::WebAssembly(const Driver &D, const llvm::Triple &Triple, } } +const char *WebAssembly::getDefaultLinker() const { + if (getOS() == "wasip2") + return "wasm-component-ld"; + return "wasm-ld"; +} + bool WebAssembly::IsMathErrnoDefault() const { return false; } bool WebAssembly::IsObjCNonFragileABIDefault() const { return true; } diff --git a/clang/lib/Driver/ToolChains/WebAssembly.h b/clang/lib/Driver/ToolChains/WebAssembly.h index ae60f464c10818..76e0ca39bd748d 100644 --- a/clang/lib/Driver/ToolChains/WebAssembly.h +++ b/clang/lib/Driver/ToolChains/WebAssembly.h @@ -67,7 +67,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssembly final : public ToolChain { llvm::opt::ArgStringList &CmdArgs) const override; SanitizerMask getSupportedSanitizers() const override; - const char *getDefaultLinker() const override { return "wasm-ld"; } + const char *getDefaultLinker() const override; CXXStdlibType GetDefaultCXXStdlibType() const override { return ToolChain::CST_Libcxx; diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index a3eb9138b21833..53cd169b05904a 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -674,7 +674,13 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, // arguments to function calls. We do this by ensuring that either all // arguments (including any lambdas) go on the same line as the function // call, or we break before the first argument. 
- auto PrevNonComment = Current.getPreviousNonComment(); + const auto *Prev = Current.Previous; + if (!Prev) + return false; + // For example, `/*Newline=*/false`. + if (Prev->is(TT_BlockComment) && Current.SpacesRequiredBefore == 0) + return false; + const auto *PrevNonComment = Current.getPreviousNonComment(); if (!PrevNonComment || PrevNonComment->isNot(tok::l_paren)) return false; if (Current.isOneOf(tok::comment, tok::l_paren, TT_LambdaLSquare)) diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index ff326dc784783b..0bbce92e962d49 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -504,22 +504,6 @@ struct ScalarEnumerationTraits { } }; -template <> -struct MappingTraits< - FormatStyle::SpaceBeforeParensCustom::AfterPlacementOperatorStyle> { - static void - mapping(IO &IO, - FormatStyle::SpaceBeforeParensCustom::AfterPlacementOperatorStyle - &Value) { - IO.enumCase(Value, "Always", - FormatStyle::SpaceBeforeParensCustom::APO_Always); - IO.enumCase(Value, "Never", - FormatStyle::SpaceBeforeParensCustom::APO_Never); - IO.enumCase(Value, "Leave", - FormatStyle::SpaceBeforeParensCustom::APO_Leave); - } -}; - template <> struct MappingTraits { static void mapping(IO &IO, FormatStyle::RawStringFormat &Format) { IO.mapOptional("Language", Format.Language); @@ -1388,12 +1372,9 @@ static void expandPresetsSpaceBeforeParens(FormatStyle &Expanded) { return; // Reset all flags Expanded.SpaceBeforeParensOptions = {}; + Expanded.SpaceBeforeParensOptions.AfterPlacementOperator = true; switch (Expanded.SpaceBeforeParens) { - case FormatStyle::SBPO_Never: - Expanded.SpaceBeforeParensOptions.AfterPlacementOperator = - FormatStyle::SpaceBeforeParensCustom::APO_Never; - break; case FormatStyle::SBPO_ControlStatements: Expanded.SpaceBeforeParensOptions.AfterControlStatements = true; Expanded.SpaceBeforeParensOptions.AfterForeachMacros = true; @@ -1405,8 +1386,6 @@ static void expandPresetsSpaceBeforeParens(FormatStyle &Expanded) { 
case FormatStyle::SBPO_NonEmptyParentheses: Expanded.SpaceBeforeParensOptions.BeforeNonEmptyParentheses = true; break; - case FormatStyle::SBPO_Always: - break; default: break; } @@ -3942,12 +3921,13 @@ const char *DefaultFallbackStyle = "LLVM"; llvm::ErrorOr> loadAndParseConfigFile(StringRef ConfigFile, llvm::vfs::FileSystem *FS, - FormatStyle *Style, bool AllowUnknownOptions) { + FormatStyle *Style, bool AllowUnknownOptions, + llvm::SourceMgr::DiagHandlerTy DiagHandler = nullptr) { llvm::ErrorOr> Text = FS->getBufferForFile(ConfigFile.str()); if (auto EC = Text.getError()) return EC; - if (auto EC = parseConfiguration(*Text.get(), Style, AllowUnknownOptions)) + if (auto EC = parseConfiguration(*Text.get(), Style, AllowUnknownOptions, DiagHandler)) return EC; return Text; } @@ -3955,7 +3935,8 @@ loadAndParseConfigFile(StringRef ConfigFile, llvm::vfs::FileSystem *FS, llvm::Expected getStyle(StringRef StyleName, StringRef FileName, StringRef FallbackStyleName, StringRef Code, llvm::vfs::FileSystem *FS, - bool AllowUnknownOptions) { + bool AllowUnknownOptions, + llvm::SourceMgr::DiagHandlerTy DiagHandler) { FormatStyle Style = getLLVMStyle(guessLanguage(FileName, Code)); FormatStyle FallbackStyle = getNoStyle(); if (!getPredefinedStyle(FallbackStyleName, Style.Language, &FallbackStyle)) @@ -3969,7 +3950,7 @@ llvm::Expected getStyle(StringRef StyleName, StringRef FileName, StringRef Source = ""; if (std::error_code ec = parseConfiguration(llvm::MemoryBufferRef(StyleName, Source), &Style, - AllowUnknownOptions)) { + AllowUnknownOptions, DiagHandler)) { return make_string_error("Error parsing -style: " + ec.message()); } @@ -3989,7 +3970,7 @@ llvm::Expected getStyle(StringRef StyleName, StringRef FileName, StyleName.starts_with_insensitive("file:")) { auto ConfigFile = StyleName.substr(5); llvm::ErrorOr> Text = - loadAndParseConfigFile(ConfigFile, FS, &Style, AllowUnknownOptions); + loadAndParseConfigFile(ConfigFile, FS, &Style, AllowUnknownOptions, DiagHandler); if 
(auto EC = Text.getError()) { return make_string_error("Error reading " + ConfigFile + ": " + EC.message()); @@ -4024,12 +4005,13 @@ llvm::Expected getStyle(StringRef StyleName, StringRef FileName, // Reset possible inheritance Style.InheritsParentConfig = false; - auto dropDiagnosticHandler = [](const llvm::SMDiagnostic &, void *) {}; + auto diagHandlerOrDropHandling = + DiagHandler ? DiagHandler : [](llvm::SMDiagnostic const &, void *) {}; auto applyChildFormatTexts = [&](FormatStyle *Style) { for (const auto &MemBuf : llvm::reverse(ChildFormatTextToApply)) { auto EC = parseConfiguration(*MemBuf, Style, AllowUnknownOptions, - dropDiagnosticHandler); + diagHandlerOrDropHandling); // It was already correctly parsed. assert(!EC); static_cast(EC); @@ -4063,7 +4045,7 @@ llvm::Expected getStyle(StringRef StyleName, StringRef FileName, } llvm::ErrorOr> Text = - loadAndParseConfigFile(ConfigFile, FS, &Style, AllowUnknownOptions); + loadAndParseConfigFile(ConfigFile, FS, &Style, AllowUnknownOptions, DiagHandler); if (auto EC = Text.getError()) { if (EC != ParseError::Unsuitable) { return make_string_error("Error reading " + ConfigFile + ": " + diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 25fcceb8786437..c1f16624819223 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -2488,6 +2488,8 @@ class AnnotatingParser { (Tok.Next->Next->is(tok::numeric_constant) || Line.InPPDirective)) { return false; } + if (Line.InPPDirective && Tok.Next->is(tok::minus)) + return false; // Search for unexpected tokens. for (FormatToken *Prev = Tok.Previous; Prev != Tok.MatchingParen; Prev = Prev->Previous) { @@ -3448,10 +3450,11 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { for (AnnotatedLine *ChildLine : Line.Children) calculateFormattingInformation(*ChildLine); - Line.First->TotalLength = - Line.First->IsMultiline ? 
Style.ColumnLimit - : Line.FirstStartColumn + Line.First->ColumnWidth; - FormatToken *Current = Line.First->Next; + auto *First = Line.First; + First->TotalLength = First->IsMultiline + ? Style.ColumnLimit + : Line.FirstStartColumn + First->ColumnWidth; + FormatToken *Current = First->Next; bool InFunctionDecl = Line.MightBeFunctionDecl; bool AlignArrayOfStructures = (Style.AlignArrayOfStructures != FormatStyle::AIAS_None && @@ -3473,16 +3476,15 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { if (const bool IsCtorOrDtor = Tok->is(TT_CtorDtorDeclName); IsCtorOrDtor || isFunctionDeclarationName(Style.isCpp(), *Tok, Line, ClosingParen)) { - if (!IsCtorOrDtor) { - LineIsFunctionDeclaration = true; + if (!IsCtorOrDtor) Tok->setFinalizedType(TT_FunctionDeclarationName); - } + LineIsFunctionDeclaration = true; SeenName = true; break; } } - if (IsCpp && LineIsFunctionDeclaration && + if (IsCpp && (LineIsFunctionDeclaration || First->is(TT_CtorDtorDeclName)) && Line.endsWith(tok::semi, tok::r_brace)) { auto *Tok = Line.Last->Previous; while (Tok->isNot(tok::r_brace)) @@ -3505,7 +3507,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { if (IsCpp) { if (!LineIsFunctionDeclaration) { // Annotate */&/&& in `operator` function calls as binary operators. 
- for (const auto *Tok = Line.First; Tok; Tok = Tok->Next) { + for (const auto *Tok = First; Tok; Tok = Tok->Next) { if (Tok->isNot(tok::kw_operator)) continue; do { @@ -3530,6 +3532,8 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { } } else if (ClosingParen) { for (auto *Tok = ClosingParen->Next; Tok; Tok = Tok->Next) { + if (Tok->is(TT_CtorInitializerColon)) + break; if (Tok->is(tok::arrow)) { Tok->setType(TT_TrailingReturnArrow); break; @@ -3642,7 +3646,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { calculateUnbreakableTailLengths(Line); unsigned IndentLevel = Line.Level; - for (Current = Line.First; Current; Current = Current->Next) { + for (Current = First; Current; Current = Current->Next) { if (Current->Role) Current->Role->precomputeFormattingInfos(Current); if (Current->MatchingParen && @@ -4272,14 +4276,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, Left.isOneOf(tok::kw_new, tok::kw_delete) && Right.isNot(TT_OverloadedOperatorLParen) && !(Line.MightBeFunctionDecl && Left.is(TT_FunctionDeclarationName))) { - if (Style.SpaceBeforeParensOptions.AfterPlacementOperator == - FormatStyle::SpaceBeforeParensCustom::APO_Always || - (Style.SpaceBeforeParensOptions.AfterPlacementOperator == - FormatStyle::SpaceBeforeParensCustom::APO_Leave && - Right.hasWhitespaceBefore())) { - return true; - } - return false; + return Style.SpaceBeforeParensOptions.AfterPlacementOperator; } if (Line.Type == LT_ObjCDecl) return true; @@ -5162,12 +5159,8 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, return true; if (Left.IsUnterminatedLiteral) return true; - // FIXME: Breaking after newlines seems useful in general. Turn this into an - // option and recognize more cases like endl etc, and break independent of - // what comes after operator lessless. 
- if (Right.is(tok::lessless) && Right.Next && - Right.Next->is(tok::string_literal) && Left.is(tok::string_literal) && - Left.TokenText.ends_with("\\n\"")) { + if (Right.is(tok::lessless) && Right.Next && Left.is(tok::string_literal) && + Right.Next->is(tok::string_literal)) { return true; } if (Right.is(TT_RequiresClause)) { diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index b904e0e56d9eb3..a6eb18bb2b3227 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -489,18 +489,23 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { }; SmallVector LBraceStack; assert(Tok->is(tok::l_brace)); + do { - // Get next non-comment, non-preprocessor token. FormatToken *NextTok; do { NextTok = Tokens->getNextToken(); } while (NextTok->is(tok::comment)); - while (NextTok->is(tok::hash) && !Line->InMacroBody) { - NextTok = Tokens->getNextToken(); - do { - NextTok = Tokens->getNextToken(); - } while (NextTok->is(tok::comment) || - (NextTok->NewlinesBefore == 0 && NextTok->isNot(tok::eof))); + + if (!Line->InMacroBody) { + // Skip PPDirective lines and comments. + while (NextTok->is(tok::hash)) { + do { + NextTok = Tokens->getNextToken(); + } while (NextTok->NewlinesBefore == 0 && NextTok->isNot(tok::eof)); + + while (NextTok->is(tok::comment)) + NextTok = Tokens->getNextToken(); + } } switch (Tok->Tok.getKind()) { @@ -534,16 +539,6 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { if (Style.Language == FormatStyle::LK_Proto) { ProbablyBracedList = NextTok->isOneOf(tok::comma, tok::r_square); } else { - // Skip NextTok over preprocessor lines, otherwise we may not - // properly diagnose the block as a braced intializer - // if the comma separator appears after the pp directive. 
- while (NextTok->is(tok::hash)) { - ScopedMacroState MacroState(*Line, Tokens, NextTok); - do { - NextTok = Tokens->getNextToken(); - } while (NextTok->isNot(tok::eof)); - } - // Using OriginalColumn to distinguish between ObjC methods and // binary operators is a bit hacky. bool NextIsObjCMethod = NextTok->isOneOf(tok::plus, tok::minus) && @@ -602,6 +597,16 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { NextTok = Tokens->getNextToken(); ProbablyBracedList = NextTok->isNot(tok::l_square); } + + // Cpp macro definition body that is a nonempty braced list or block: + if (Style.isCpp() && Line->InMacroBody && PrevTok != FormatTok && + !FormatTok->Previous && NextTok->is(tok::eof) && + // A statement can end with only `;` (simple statement), a block + // closing brace (compound statement), or `:` (label statement). + // If PrevTok is a block opening brace, Tok ends an empty block. + !PrevTok->isOneOf(tok::semi, BK_Block, tok::colon)) { + ProbablyBracedList = true; + } } if (ProbablyBracedList) { Tok->setBlockKind(BK_BracedInit); @@ -631,6 +636,7 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { default: break; } + PrevTok = Tok; Tok = NextTok; } while (Tok->isNot(tok::eof) && !LBraceStack.empty()); @@ -2515,7 +2521,7 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) { parseChildBlock(); break; case tok::r_paren: - if (!MightBeStmtExpr && + if (!MightBeStmtExpr && !Line->InMacroBody && Style.RemoveParentheses > FormatStyle::RPS_Leave) { const auto *Prev = LeftParen->Previous; const auto *Next = Tokens->peekNextToken(); diff --git a/clang/lib/Headers/__stddef_null.h b/clang/lib/Headers/__stddef_null.h index 7336fdab389723..c10bd2d7d9887c 100644 --- a/clang/lib/Headers/__stddef_null.h +++ b/clang/lib/Headers/__stddef_null.h @@ -7,7 +7,7 @@ *===-----------------------------------------------------------------------=== */ -#if !defined(NULL) || !__has_feature(modules) +#if !defined(NULL) || 
!__building_module(_Builtin_stddef) /* linux/stddef.h will define NULL to 0. glibc (and other) headers then define * __need_NULL and rely on stddef.h to redefine NULL to the correct value again. diff --git a/clang/lib/Headers/__stddef_nullptr_t.h b/clang/lib/Headers/__stddef_nullptr_t.h index 183d394d56c1b7..7f3fbe6fe0d3a8 100644 --- a/clang/lib/Headers/__stddef_nullptr_t.h +++ b/clang/lib/Headers/__stddef_nullptr_t.h @@ -7,7 +7,12 @@ *===-----------------------------------------------------------------------=== */ -#ifndef _NULLPTR_T +/* + * When -fbuiltin-headers-in-system-modules is set this is a non-modular header + * and needs to behave as if it was textual. + */ +#if !defined(_NULLPTR_T) || \ + (__has_feature(modules) && !__building_module(_Builtin_stddef)) #define _NULLPTR_T #ifdef __cplusplus diff --git a/clang/lib/Headers/__stddef_offsetof.h b/clang/lib/Headers/__stddef_offsetof.h index 3b347b3b92f62c..84172c6cd27352 100644 --- a/clang/lib/Headers/__stddef_offsetof.h +++ b/clang/lib/Headers/__stddef_offsetof.h @@ -7,6 +7,11 @@ *===-----------------------------------------------------------------------=== */ -#ifndef offsetof +/* + * When -fbuiltin-headers-in-system-modules is set this is a non-modular header + * and needs to behave as if it was textual. + */ +#if !defined(offsetof) || \ + (__has_feature(modules) && !__building_module(_Builtin_stddef)) #define offsetof(t, d) __builtin_offsetof(t, d) #endif diff --git a/clang/lib/Headers/__stddef_ptrdiff_t.h b/clang/lib/Headers/__stddef_ptrdiff_t.h index 3ea6d7d2852e1c..fd3c893c66c979 100644 --- a/clang/lib/Headers/__stddef_ptrdiff_t.h +++ b/clang/lib/Headers/__stddef_ptrdiff_t.h @@ -7,7 +7,12 @@ *===-----------------------------------------------------------------------=== */ -#ifndef _PTRDIFF_T +/* + * When -fbuiltin-headers-in-system-modules is set this is a non-modular header + * and needs to behave as if it was textual. 
+ */ +#if !defined(_PTRDIFF_T) || \ + (__has_feature(modules) && !__building_module(_Builtin_stddef)) #define _PTRDIFF_T typedef __PTRDIFF_TYPE__ ptrdiff_t; diff --git a/clang/lib/Headers/__stddef_rsize_t.h b/clang/lib/Headers/__stddef_rsize_t.h index b6428d0c12b62a..dd433d40d9733a 100644 --- a/clang/lib/Headers/__stddef_rsize_t.h +++ b/clang/lib/Headers/__stddef_rsize_t.h @@ -7,7 +7,12 @@ *===-----------------------------------------------------------------------=== */ -#ifndef _RSIZE_T +/* + * When -fbuiltin-headers-in-system-modules is set this is a non-modular header + * and needs to behave as if it was textual. + */ +#if !defined(_RSIZE_T) || \ + (__has_feature(modules) && !__building_module(_Builtin_stddef)) #define _RSIZE_T typedef __SIZE_TYPE__ rsize_t; diff --git a/clang/lib/Headers/__stddef_size_t.h b/clang/lib/Headers/__stddef_size_t.h index e4a389510bcdbf..3dd7b1f3792949 100644 --- a/clang/lib/Headers/__stddef_size_t.h +++ b/clang/lib/Headers/__stddef_size_t.h @@ -7,7 +7,12 @@ *===-----------------------------------------------------------------------=== */ -#ifndef _SIZE_T +/* + * When -fbuiltin-headers-in-system-modules is set this is a non-modular header + * and needs to behave as if it was textual. + */ +#if !defined(_SIZE_T) || \ + (__has_feature(modules) && !__building_module(_Builtin_stddef)) #define _SIZE_T typedef __SIZE_TYPE__ size_t; diff --git a/clang/lib/Headers/__stddef_unreachable.h b/clang/lib/Headers/__stddef_unreachable.h index 3e7fe01979662a..61df43e9732f8a 100644 --- a/clang/lib/Headers/__stddef_unreachable.h +++ b/clang/lib/Headers/__stddef_unreachable.h @@ -7,6 +7,15 @@ *===-----------------------------------------------------------------------=== */ -#ifndef unreachable +#ifndef __cplusplus + +/* + * When -fbuiltin-headers-in-system-modules is set this is a non-modular header + * and needs to behave as if it was textual. 
+ */ +#if !defined(unreachable) || \ + (__has_feature(modules) && !__building_module(_Builtin_stddef)) #define unreachable() __builtin_unreachable() #endif + +#endif diff --git a/clang/lib/Headers/__stddef_wchar_t.h b/clang/lib/Headers/__stddef_wchar_t.h index 16a6186512c0c3..bd69f632254163 100644 --- a/clang/lib/Headers/__stddef_wchar_t.h +++ b/clang/lib/Headers/__stddef_wchar_t.h @@ -9,7 +9,12 @@ #if !defined(__cplusplus) || (defined(_MSC_VER) && !_NATIVE_WCHAR_T_DEFINED) -#ifndef _WCHAR_T +/* + * When -fbuiltin-headers-in-system-modules is set this is a non-modular header + * and needs to behave as if it was textual. + */ +#if !defined(_WCHAR_T) || \ + (__has_feature(modules) && !__building_module(_Builtin_stddef)) #define _WCHAR_T #ifdef _MSC_EXTENSIONS diff --git a/clang/lib/Headers/larchintrin.h b/clang/lib/Headers/larchintrin.h index a613e5ca0e5ecd..f4218295919a0d 100644 --- a/clang/lib/Headers/larchintrin.h +++ b/clang/lib/Headers/larchintrin.h @@ -156,7 +156,7 @@ extern __inline unsigned char return (unsigned char)__builtin_loongarch_iocsrrd_b((unsigned int)_1); } -extern __inline unsigned char +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __iocsrrd_h(unsigned int _1) { return (unsigned short)__builtin_loongarch_iocsrrd_h((unsigned int)_1); diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap index a786689d391773..56a13f69bc0559 100644 --- a/clang/lib/Headers/module.modulemap +++ b/clang/lib/Headers/module.modulemap @@ -155,9 +155,9 @@ module _Builtin_intrinsics [system] [extern_c] { // Start -fbuiltin-headers-in-system-modules affected modules -// The following modules all ignore their top level headers -// when -fbuiltin-headers-in-system-modules is passed, and -// most of those headers join system modules when present. 
+// The following modules all ignore their headers when +// -fbuiltin-headers-in-system-modules is passed, and many of +// those headers join system modules when present. // e.g. if -fbuiltin-headers-in-system-modules is passed, then // float.h will not be in the _Builtin_float module (that module @@ -190,11 +190,6 @@ module _Builtin_stdalign [system] { export * } -// When -fbuiltin-headers-in-system-modules is passed, only -// the top level headers are removed, the implementation headers -// will always be in their submodules. That means when stdarg.h -// is included, it will still import this module and make the -// appropriate submodules visible. module _Builtin_stdarg [system] { textual header "stdarg.h" @@ -237,6 +232,8 @@ module _Builtin_stdbool [system] { module _Builtin_stddef [system] { textual header "stddef.h" + // __stddef_max_align_t.h is always in this module, even if + // -fbuiltin-headers-in-system-modules is passed. explicit module max_align_t { header "__stddef_max_align_t.h" export * @@ -283,9 +280,10 @@ module _Builtin_stddef [system] { } } -/* wint_t is provided by and not . It's here - * for compatibility, but must be explicitly requested. Therefore - * __stddef_wint_t.h is not part of _Builtin_stddef. */ +// wint_t is provided by and not . It's here +// for compatibility, but must be explicitly requested. Therefore +// __stddef_wint_t.h is not part of _Builtin_stddef. It is always in +// this module even if -fbuiltin-headers-in-system-modules is passed. 
module _Builtin_stddef_wint_t [system] { header "__stddef_wint_t.h" export * diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index afb2948f05ae5b..10c475f617d485 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -2498,9 +2498,12 @@ void ModuleMapParser::parseHeaderDecl(MMToken::TokenKind LeadingToken, } bool NeedsFramework = false; - // Don't add the top level headers to the builtin modules if the builtin headers - // belong to the system modules. - if (!Map.LangOpts.BuiltinHeadersInSystemModules || ActiveModule->isSubModule() || !isBuiltInModuleName(ActiveModule->Name)) + // Don't add headers to the builtin modules if the builtin headers belong to + // the system modules, with the exception of __stddef_max_align_t.h which + // always had its own module. + if (!Map.LangOpts.BuiltinHeadersInSystemModules || + !isBuiltInModuleName(ActiveModule->getTopLevelModuleName()) || + ActiveModule->fullModuleNameIs({"_Builtin_stddef", "max_align_t"})) Map.addUnresolvedHeader(ActiveModule, std::move(Header), NeedsFramework); if (NeedsFramework) diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 9f82a6d073e3ba..a980f4bcbae124 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -3288,7 +3288,7 @@ void Preprocessor::HandleIfdefDirective(Token &Result, return; } - emitMacroExpansionWarnings(MacroNameTok); + emitMacroExpansionWarnings(MacroNameTok, /*IsIfnDef=*/true); // Check to see if this is the last token on the #if[n]def line. CheckEndOfDirective(isIfndef ? "ifndef" : "ifdef"); diff --git a/clang/lib/Lex/PPExpressions.cpp b/clang/lib/Lex/PPExpressions.cpp index 1feb0eb18d71e6..8f25c67ec9dfbe 100644 --- a/clang/lib/Lex/PPExpressions.cpp +++ b/clang/lib/Lex/PPExpressions.cpp @@ -133,7 +133,9 @@ static bool EvaluateDefined(PPValue &Result, Token &PeekTok, DefinedTracker &DT, Result.Val.setIsUnsigned(false); // Result is signed intmax_t. 
DT.IncludedUndefinedIds = !Macro; - PP.emitMacroExpansionWarnings(PeekTok); + PP.emitMacroExpansionWarnings( + PeekTok, + (II->getName() == "INFINITY" || II->getName() == "NAN") ? true : false); // If there is a macro, mark it used. if (Result.Val != 0 && ValueLive) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 7833d5a2ea20ee..09b7e1c62fbd7b 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -7545,47 +7545,43 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, } } - // If the callee uses AArch64 SME ZA state but the caller doesn't define - // any, then this is an error. - FunctionType::ArmStateValue ArmZAState = + FunctionType::ArmStateValue CalleeArmZAState = FunctionType::getArmZAState(ExtInfo.AArch64SMEAttributes); - if (ArmZAState != FunctionType::ARM_None) { + FunctionType::ArmStateValue CalleeArmZT0State = + FunctionType::getArmZT0State(ExtInfo.AArch64SMEAttributes); + if (CalleeArmZAState != FunctionType::ARM_None || + CalleeArmZT0State != FunctionType::ARM_None) { bool CallerHasZAState = false; + bool CallerHasZT0State = false; if (const auto *CallerFD = dyn_cast(CurContext)) { auto *Attr = CallerFD->getAttr(); if (Attr && Attr->isNewZA()) CallerHasZAState = true; - else if (const auto *FPT = - CallerFD->getType()->getAs()) - CallerHasZAState = FunctionType::getArmZAState( - FPT->getExtProtoInfo().AArch64SMEAttributes) != - FunctionType::ARM_None; - } - - if (!CallerHasZAState) - Diag(Loc, diag::err_sme_za_call_no_za_state); - } - - // If the callee uses AArch64 SME ZT0 state but the caller doesn't define - // any, then this is an error. 
- FunctionType::ArmStateValue ArmZT0State = - FunctionType::getArmZT0State(ExtInfo.AArch64SMEAttributes); - if (ArmZT0State != FunctionType::ARM_None) { - bool CallerHasZT0State = false; - if (const auto *CallerFD = dyn_cast(CurContext)) { - auto *Attr = CallerFD->getAttr(); if (Attr && Attr->isNewZT0()) CallerHasZT0State = true; - else if (const auto *FPT = - CallerFD->getType()->getAs()) - CallerHasZT0State = + if (const auto *FPT = CallerFD->getType()->getAs()) { + CallerHasZAState |= + FunctionType::getArmZAState( + FPT->getExtProtoInfo().AArch64SMEAttributes) != + FunctionType::ARM_None; + CallerHasZT0State |= FunctionType::getArmZT0State( FPT->getExtProtoInfo().AArch64SMEAttributes) != FunctionType::ARM_None; + } } - if (!CallerHasZT0State) + if (CalleeArmZAState != FunctionType::ARM_None && !CallerHasZAState) + Diag(Loc, diag::err_sme_za_call_no_za_state); + + if (CalleeArmZT0State != FunctionType::ARM_None && !CallerHasZT0State) Diag(Loc, diag::err_sme_zt0_call_no_zt0_state); + + if (CallerHasZAState && CalleeArmZAState == FunctionType::ARM_None && + CalleeArmZT0State != FunctionType::ARM_None) { + Diag(Loc, diag::err_sme_unimplemented_za_save_restore); + Diag(Loc, diag::note_sme_use_preserves_za); + } } } @@ -7643,9 +7639,8 @@ bool Sema::CheckFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall, unsigned NumArgs = TheCall->getNumArgs(); Expr *ImplicitThis = nullptr; - if (IsMemberOperatorCall && !FDecl->isStatic() && - !FDecl->hasCXXExplicitFunctionObjectParameter()) { - // If this is a call to a non-static member operator, hide the first + if (IsMemberOperatorCall && !FDecl->hasCXXExplicitFunctionObjectParameter()) { + // If this is a call to a member operator, hide the first // argument from checkCall. // FIXME: Our choice of AST representation here is less than ideal. 
ImplicitThis = Args[0]; diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index acfc00f4125407..88fc846c89e42c 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -612,8 +612,12 @@ bool Sema::SetupConstraintScope( // If this is a member function, make sure we get the parameters that // reference the original primary template. - if (const auto *FromMemTempl = - PrimaryTemplate->getInstantiatedFromMemberTemplate()) { + // We walk up the instantiated template chain so that nested lambdas get + // handled properly. + for (FunctionTemplateDecl *FromMemTempl = + PrimaryTemplate->getInstantiatedFromMemberTemplate(); + FromMemTempl; + FromMemTempl = FromMemTempl->getInstantiatedFromMemberTemplate()) { if (addInstantiatedParametersToScope(FD, FromMemTempl->getTemplatedDecl(), Scope, MLTAL)) return true; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index f9bf1d14bdc4f6..f5bb3e0b42e26c 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -8396,28 +8396,40 @@ void Sema::CheckShadow(NamedDecl *D, NamedDecl *ShadowedDecl, unsigned WarningDiag = diag::warn_decl_shadow; SourceLocation CaptureLoc; - if (isa(D) && isa(ShadowedDecl) && NewDC && - isa(NewDC)) { + if (isa(D) && NewDC && isa(NewDC)) { if (const auto *RD = dyn_cast(NewDC->getParent())) { if (RD->isLambda() && OldDC->Encloses(NewDC->getLexicalParent())) { - if (RD->getLambdaCaptureDefault() == LCD_None) { - // Try to avoid warnings for lambdas with an explicit capture list. + if (const auto *VD = dyn_cast(ShadowedDecl)) { const auto *LSI = cast(getCurFunction()); - // Warn only when the lambda captures the shadowed decl explicitly. 
- CaptureLoc = getCaptureLocation(LSI, cast(ShadowedDecl)); - if (CaptureLoc.isInvalid()) - WarningDiag = diag::warn_decl_shadow_uncaptured_local; - } else { - // Remember that this was shadowed so we can avoid the warning if the - // shadowed decl isn't captured and the warning settings allow it. + if (RD->getLambdaCaptureDefault() == LCD_None) { + // Try to avoid warnings for lambdas with an explicit capture + // list. Warn only when the lambda captures the shadowed decl + // explicitly. + CaptureLoc = getCaptureLocation(LSI, VD); + if (CaptureLoc.isInvalid()) + WarningDiag = diag::warn_decl_shadow_uncaptured_local; + } else { + // Remember that this was shadowed so we can avoid the warning if + // the shadowed decl isn't captured and the warning settings allow + // it. + cast(getCurFunction()) + ->ShadowingDecls.push_back({D, VD}); + return; + } + } + if (isa(ShadowedDecl)) { + // If lambda can capture this, then emit default shadowing warning, + // Otherwise it is not really a shadowing case since field is not + // available in lambda's body. + // At this point we don't know that lambda can capture this, so + // remember that this was shadowed and delay until we know. cast(getCurFunction()) - ->ShadowingDecls.push_back( - {cast(D), cast(ShadowedDecl)}); + ->ShadowingDecls.push_back({D, ShadowedDecl}); return; } } - - if (cast(ShadowedDecl)->hasLocalStorage()) { + if (const auto *VD = dyn_cast(ShadowedDecl); + VD && VD->hasLocalStorage()) { // A variable can't shadow a local variable in an enclosing scope, if // they are separated by a non-capturing declaration context. for (DeclContext *ParentDC = NewDC; @@ -8468,19 +8480,28 @@ void Sema::CheckShadow(NamedDecl *D, NamedDecl *ShadowedDecl, /// when these variables are captured by the lambda. 
void Sema::DiagnoseShadowingLambdaDecls(const LambdaScopeInfo *LSI) { for (const auto &Shadow : LSI->ShadowingDecls) { - const VarDecl *ShadowedDecl = Shadow.ShadowedDecl; + const NamedDecl *ShadowedDecl = Shadow.ShadowedDecl; // Try to avoid the warning when the shadowed decl isn't captured. - SourceLocation CaptureLoc = getCaptureLocation(LSI, ShadowedDecl); const DeclContext *OldDC = ShadowedDecl->getDeclContext(); - Diag(Shadow.VD->getLocation(), CaptureLoc.isInvalid() - ? diag::warn_decl_shadow_uncaptured_local - : diag::warn_decl_shadow) - << Shadow.VD->getDeclName() - << computeShadowedDeclKind(ShadowedDecl, OldDC) << OldDC; - if (!CaptureLoc.isInvalid()) - Diag(CaptureLoc, diag::note_var_explicitly_captured_here) - << Shadow.VD->getDeclName() << /*explicitly*/ 0; - Diag(ShadowedDecl->getLocation(), diag::note_previous_declaration); + if (const auto *VD = dyn_cast(ShadowedDecl)) { + SourceLocation CaptureLoc = getCaptureLocation(LSI, VD); + Diag(Shadow.VD->getLocation(), + CaptureLoc.isInvalid() ? diag::warn_decl_shadow_uncaptured_local + : diag::warn_decl_shadow) + << Shadow.VD->getDeclName() + << computeShadowedDeclKind(ShadowedDecl, OldDC) << OldDC; + if (CaptureLoc.isValid()) + Diag(CaptureLoc, diag::note_var_explicitly_captured_here) + << Shadow.VD->getDeclName() << /*explicitly*/ 0; + Diag(ShadowedDecl->getLocation(), diag::note_previous_declaration); + } else if (isa(ShadowedDecl)) { + Diag(Shadow.VD->getLocation(), + LSI->isCXXThisCaptured() ? 
diag::warn_decl_shadow + : diag::warn_decl_shadow_uncaptured_local) + << Shadow.VD->getDeclName() + << computeShadowedDeclKind(ShadowedDecl, OldDC) << OldDC; + Diag(ShadowedDecl->getLocation(), diag::note_previous_declaration); + } } } @@ -12752,7 +12773,8 @@ namespace { } if (OpaqueValueExpr *OVE = dyn_cast(E)) { - HandleValue(OVE->getSourceExpr()); + if (Expr *SE = OVE->getSourceExpr()) + HandleValue(SE); return; } diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 6413a48f809ac9..4cce0abc231505 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -11142,7 +11142,8 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS, if (VecType->getVectorKind() == VectorKind::SveFixedLengthData || VecType->getVectorKind() == VectorKind::SveFixedLengthPredicate) return true; - if (VecType->getVectorKind() == VectorKind::RVVFixedLengthData) { + if (VecType->getVectorKind() == VectorKind::RVVFixedLengthData || + VecType->getVectorKind() == VectorKind::RVVFixedLengthMask) { SVEorRVV = 1; return true; } @@ -11173,7 +11174,8 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS, SecondVecType->getVectorKind() == VectorKind::SveFixedLengthPredicate) return true; - if (SecondVecType->getVectorKind() == VectorKind::RVVFixedLengthData) { + if (SecondVecType->getVectorKind() == VectorKind::RVVFixedLengthData || + SecondVecType->getVectorKind() == VectorKind::RVVFixedLengthMask) { SVEorRVV = 1; return true; } @@ -14060,7 +14062,7 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS, Expr::EvalResult EVResult; if (RHS.get()->EvaluateAsInt(EVResult, Context)) { llvm::APSInt Result = EVResult.Val.getInt(); - if ((getLangOpts().Bool && !RHS.get()->getType()->isBooleanType() && + if ((getLangOpts().CPlusPlus && !RHS.get()->getType()->isBooleanType() && !RHS.get()->getExprLoc().isMacroID()) || (Result != 0 && Result != 1)) { Diag(Loc, diag::warn_logical_instead_of_bitwise) @@ -18292,7 
+18294,6 @@ void Sema::CheckUnusedVolatileAssignment(Expr *E) { } void Sema::MarkExpressionAsImmediateEscalating(Expr *E) { - assert(!FunctionScopes.empty() && "Expected a function scope"); assert(getLangOpts().CPlusPlus20 && ExprEvalContexts.back().InImmediateEscalatingFunctionContext && "Cannot mark an immediate escalating expression outside of an " @@ -18309,7 +18310,8 @@ void Sema::MarkExpressionAsImmediateEscalating(Expr *E) { } else { assert(false && "expected an immediately escalating expression"); } - getCurFunction()->FoundImmediateEscalatingExpression = true; + if (FunctionScopeInfo *FI = getCurFunction()) + FI->FoundImmediateEscalatingExpression = true; } ExprResult Sema::CheckForImmediateInvocation(ExprResult E, FunctionDecl *Decl) { diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 91e4cb7b68a24a..457fa377355a97 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -4200,7 +4200,7 @@ static OverloadingResult ResolveConstructorOverload( /// \param IsListInit Is this list-initialization? /// \param IsInitListCopy Is this non-list-initialization resulting from a /// list-initialization from {x} where x is the same -/// aggregate type as the entity? +/// type as the entity? static void TryConstructorInitialization(Sema &S, const InitializedEntity &Entity, const InitializationKind &Kind, @@ -4230,14 +4230,6 @@ static void TryConstructorInitialization(Sema &S, Entity.getKind() != InitializedEntity::EK_LambdaToBlockConversionBlockElement); - bool CopyElisionPossible = false; - auto ElideConstructor = [&] { - // Convert qualifications if necessary. 
- Sequence.AddQualificationConversionStep(DestType, VK_PRValue); - if (ILE) - Sequence.RewrapReferenceInitList(DestType, ILE); - }; - // C++17 [dcl.init]p17: // - If the initializer expression is a prvalue and the cv-unqualified // version of the source type is the same class as the class of the @@ -4250,17 +4242,11 @@ static void TryConstructorInitialization(Sema &S, if (S.getLangOpts().CPlusPlus17 && !RequireActualConstructor && UnwrappedArgs.size() == 1 && UnwrappedArgs[0]->isPRValue() && S.Context.hasSameUnqualifiedType(UnwrappedArgs[0]->getType(), DestType)) { - if (ILE && !DestType->isAggregateType()) { - // CWG2311: T{ prvalue_of_type_T } is not eligible for copy elision - // Make this an elision if this won't call an initializer-list - // constructor. (Always on an aggregate type or check constructors first.) - assert(!IsInitListCopy && - "IsInitListCopy only possible with aggregate types"); - CopyElisionPossible = true; - } else { - ElideConstructor(); - return; - } + // Convert qualifications if necessary. 
+ Sequence.AddQualificationConversionStep(DestType, VK_PRValue); + if (ILE) + Sequence.RewrapReferenceInitList(DestType, ILE); + return; } const RecordType *DestRecordType = DestType->getAs(); @@ -4305,12 +4291,6 @@ static void TryConstructorInitialization(Sema &S, S, Kind.getLocation(), Args, CandidateSet, DestType, Ctors, Best, CopyInitialization, AllowExplicit, /*OnlyListConstructors=*/true, IsListInit, RequireActualConstructor); - - if (CopyElisionPossible && Result == OR_No_Viable_Function) { - // No initializer list candidate - ElideConstructor(); - return; - } } // C++11 [over.match.list]p1: @@ -4592,9 +4572,9 @@ static void TryListInitialization(Sema &S, return; } - // C++11 [dcl.init.list]p3, per DR1467 and DR2137: - // - If T is an aggregate class and the initializer list has a single element - // of type cv U, where U is T or a class derived from T, the object is + // C++11 [dcl.init.list]p3, per DR1467: + // - If T is a class type and the initializer list has a single element of + // type cv U, where U is T or a class derived from T, the object is // initialized from that element (by copy-initialization for // copy-list-initialization, or by direct-initialization for // direct-list-initialization). @@ -4605,7 +4585,7 @@ static void TryListInitialization(Sema &S, // - Otherwise, if T is an aggregate, [...] (continue below). 
if (S.getLangOpts().CPlusPlus11 && InitList->getNumInits() == 1 && !IsDesignatedInit) { - if (DestType->isRecordType() && DestType->isAggregateType()) { + if (DestType->isRecordType()) { QualType InitType = InitList->getInit(0)->getType(); if (S.Context.hasSameUnqualifiedType(InitType, DestType) || S.IsDerivedFrom(InitList->getBeginLoc(), InitType, DestType)) { diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 030878899b8122..b708272ebe7d87 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1568,37 +1568,19 @@ TryUserDefinedConversion(Sema &S, Expr *From, QualType ToType, // called for those cases. if (CXXConstructorDecl *Constructor = dyn_cast(ICS.UserDefined.ConversionFunction)) { - QualType FromType; - SourceLocation FromLoc; - // C++11 [over.ics.list]p6, per DR2137: - // C++17 [over.ics.list]p6: - // If C is not an initializer-list constructor and the initializer list - // has a single element of type cv U, where U is X or a class derived - // from X, the implicit conversion sequence has Exact Match rank if U is - // X, or Conversion rank if U is derived from X. 
- if (const auto *InitList = dyn_cast(From); - InitList && InitList->getNumInits() == 1 && - !S.isInitListConstructor(Constructor)) { - const Expr *SingleInit = InitList->getInit(0); - FromType = SingleInit->getType(); - FromLoc = SingleInit->getBeginLoc(); - } else { - FromType = From->getType(); - FromLoc = From->getBeginLoc(); - } - QualType FromCanon = - S.Context.getCanonicalType(FromType.getUnqualifiedType()); + QualType FromCanon + = S.Context.getCanonicalType(From->getType().getUnqualifiedType()); QualType ToCanon = S.Context.getCanonicalType(ToType).getUnqualifiedType(); if (Constructor->isCopyConstructor() && (FromCanon == ToCanon || - S.IsDerivedFrom(FromLoc, FromCanon, ToCanon))) { + S.IsDerivedFrom(From->getBeginLoc(), FromCanon, ToCanon))) { // Turn this into a "standard" conversion sequence, so that it // gets ranked with standard conversion sequences. DeclAccessPair Found = ICS.UserDefined.FoundConversionFunction; ICS.setStandard(); ICS.Standard.setAsIdentityConversion(); - ICS.Standard.setFromType(FromType); + ICS.Standard.setFromType(From->getType()); ICS.Standard.setAllToTypes(ToType); ICS.Standard.CopyConstructor = Constructor; ICS.Standard.FoundCopyConstructor = Found; @@ -5324,18 +5306,18 @@ TryListConversion(Sema &S, InitListExpr *From, QualType ToType, IsDesignatedInit) return Result; - // Per DR1467 and DR2137: - // If the parameter type is an aggregate class X and the initializer list - // has a single element of type cv U, where U is X or a class derived from - // X, the implicit conversion sequence is the one required to convert the - // element to the parameter type. + // Per DR1467: + // If the parameter type is a class X and the initializer list has a single + // element of type cv U, where U is X or a class derived from X, the + // implicit conversion sequence is the one required to convert the element + // to the parameter type. // // Otherwise, if the parameter type is a character array [... 
] // and the initializer list has a single element that is an // appropriately-typed string literal (8.5.2 [dcl.init.string]), the // implicit conversion sequence is the identity conversion. if (From->getNumInits() == 1 && !IsDesignatedInit) { - if (ToType->isRecordType() && ToType->isAggregateType()) { + if (ToType->isRecordType()) { QualType InitType = From->getInit(0)->getType(); if (S.Context.hasSameUnqualifiedType(InitType, ToType) || S.IsDerivedFrom(From->getBeginLoc(), InitType, ToType)) @@ -5682,10 +5664,15 @@ static ImplicitConversionSequence TryObjectArgumentInitialization( assert(FromType->isRecordType()); QualType ClassType = S.Context.getTypeDeclType(ActingContext); - // [class.dtor]p2: A destructor can be invoked for a const, volatile or - // const volatile object. + // C++98 [class.dtor]p2: + // A destructor can be invoked for a const, volatile or const volatile + // object. + // C++98 [over.match.funcs]p4: + // For static member functions, the implicit object parameter is considered + // to match any object (since if the function is selected, the object is + // discarded). Qualifiers Quals = Method->getMethodQualifiers(); - if (isa(Method)) { + if (isa(Method) || Method->isStatic()) { Quals.addConst(); Quals.addVolatile(); } @@ -14483,6 +14470,23 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, CurFPFeatureOverrides()); } + // If this is the .* operator, which is not overloadable, just + // create a built-in binary operator. + if (Opc == BO_PtrMemD) { + auto CheckPlaceholder = [&](Expr *&Arg) { + ExprResult Res = CheckPlaceholderExpr(Arg); + if (Res.isUsable()) + Arg = Res.get(); + return !Res.isUsable(); + }; + + // CreateBuiltinBinOp() doesn't like it if we tell it to create a '.*' + // expression that contains placeholders (in either the LHS or RHS). 
+ if (CheckPlaceholder(Args[0]) || CheckPlaceholder(Args[1])) + return ExprError(); + return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]); + } + // Always do placeholder-like conversions on the RHS. if (checkPlaceholderForOverload(*this, Args[1])) return ExprError(); @@ -14502,11 +14506,6 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, if (Opc == BO_Assign && !Args[0]->getType()->isOverloadableType()) return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]); - // If this is the .* operator, which is not overloadable, just - // create a built-in binary operator. - if (Opc == BO_PtrMemD) - return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]); - // Build the overload set. OverloadCandidateSet CandidateSet(OpLoc, OverloadCandidateSet::CSK_Operator, OverloadCandidateSet::OperatorRewriteInfo( @@ -15079,7 +15078,7 @@ ExprResult Sema::CreateOverloadedArraySubscriptExpr(SourceLocation LLoc, CXXMethodDecl *Method = cast(FnDecl); SmallVector MethodArgs; - // Handle 'this' parameter if the selected function is not static. + // Initialize the object parameter. 
if (Method->isExplicitObjectMemberFunction()) { ExprResult Res = InitializeExplicitObjectArgument(*this, Args[0], Method); @@ -15087,7 +15086,7 @@ ExprResult Sema::CreateOverloadedArraySubscriptExpr(SourceLocation LLoc, return ExprError(); Args[0] = Res.get(); ArgExpr = Args; - } else if (Method->isInstance()) { + } else { ExprResult Arg0 = PerformImplicitObjectArgumentInitialization( Args[0], /*Qualifier=*/nullptr, Best->FoundDecl, Method); if (Arg0.isInvalid()) @@ -15115,15 +15114,9 @@ ExprResult Sema::CreateOverloadedArraySubscriptExpr(SourceLocation LLoc, ExprValueKind VK = Expr::getValueKindForType(ResultTy); ResultTy = ResultTy.getNonLValueExprType(Context); - CallExpr *TheCall; - if (Method->isInstance()) - TheCall = CXXOperatorCallExpr::Create( - Context, OO_Subscript, FnExpr.get(), MethodArgs, ResultTy, VK, - RLoc, CurFPFeatureOverrides()); - else - TheCall = - CallExpr::Create(Context, FnExpr.get(), MethodArgs, ResultTy, VK, - RLoc, CurFPFeatureOverrides()); + CallExpr *TheCall = CXXOperatorCallExpr::Create( + Context, OO_Subscript, FnExpr.get(), MethodArgs, ResultTy, VK, RLoc, + CurFPFeatureOverrides()); if (CheckCallReturnType(FnDecl->getReturnType(), LLoc, TheCall, FnDecl)) return ExprError(); @@ -15751,15 +15744,13 @@ Sema::BuildCallToObjectOfClassType(Scope *S, Expr *Obj, bool IsError = false; - // Initialize the implicit object parameter if needed. - // Since C++23, this could also be a call to a static call operator - // which we emit as a regular CallExpr. + // Initialize the object parameter. llvm::SmallVector NewArgs; if (Method->isExplicitObjectMemberFunction()) { // FIXME: we should do that during the definition of the lambda when we can. 
DiagnoseInvalidExplicitObjectParameterInLambda(Method); PrepareExplicitObjectArgument(*this, Method, Obj, Args, NewArgs); - } else if (Method->isInstance()) { + } else { ExprResult ObjRes = PerformImplicitObjectArgumentInitialization( Object.get(), /*Qualifier=*/nullptr, Best->FoundDecl, Method); if (ObjRes.isInvalid()) @@ -15793,14 +15784,9 @@ Sema::BuildCallToObjectOfClassType(Scope *S, Expr *Obj, ExprValueKind VK = Expr::getValueKindForType(ResultTy); ResultTy = ResultTy.getNonLValueExprType(Context); - CallExpr *TheCall; - if (Method->isInstance()) - TheCall = CXXOperatorCallExpr::Create(Context, OO_Call, NewFn.get(), - MethodArgs, ResultTy, VK, RParenLoc, - CurFPFeatureOverrides()); - else - TheCall = CallExpr::Create(Context, NewFn.get(), MethodArgs, ResultTy, VK, - RParenLoc, CurFPFeatureOverrides()); + CallExpr *TheCall = CXXOperatorCallExpr::Create( + Context, OO_Call, NewFn.get(), MethodArgs, ResultTy, VK, RParenLoc, + CurFPFeatureOverrides()); if (CheckCallReturnType(Method->getReturnType(), LParenLoc, TheCall, Method)) return true; diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 9bfa71dc8bcf1d..b619f5d729e86b 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -1830,7 +1830,27 @@ static TemplateParameterList *GetTemplateParameterList(TemplateDecl *TD) { // Make sure we get the template parameter list from the most // recent declaration, since that is the only one that is guaranteed to // have all the default template argument information. - return cast(TD->getMostRecentDecl())->getTemplateParameters(); + Decl *D = TD->getMostRecentDecl(); + // C++11 [temp.param]p12: + // A default template argument shall not be specified in a friend class + // template declaration. + // + // Skip past friend *declarations* because they are not supposed to contain + // default template arguments. 
Moreover, these declarations may introduce + // template parameters living in different template depths than the + // corresponding template parameters in TD, causing unmatched constraint + // substitution. + // + // FIXME: Diagnose such cases within a class template: + // template + // struct S { + // template friend struct C; + // }; + // template struct S; + while (D->getFriendObjectKind() != Decl::FriendObjectKind::FOK_None && + D->getPreviousDecl()) + D = D->getPreviousDecl(); + return cast(D)->getTemplateParameters(); } DeclResult Sema::CheckClassTemplate( @@ -7412,9 +7432,9 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, if (ArgResult.isInvalid()) return ExprError(); - // Prior to C++20, enforce restrictions on possible template argument - // values. - if (!getLangOpts().CPlusPlus20 && Value.isLValue()) { + if (Value.isLValue()) { + APValue::LValueBase Base = Value.getLValueBase(); + auto *VD = const_cast(Base.dyn_cast()); // For a non-type template-parameter of pointer or reference type, // the value of the constant expression shall not refer to assert(ParamType->isPointerType() || ParamType->isReferenceType() || @@ -7423,8 +7443,6 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, // -- a string literal // -- the result of a typeid expression, or // -- a predefined __func__ variable - APValue::LValueBase Base = Value.getLValueBase(); - auto *VD = const_cast(Base.dyn_cast()); if (Base && (!VD || isa(VD))) { @@ -7432,24 +7450,30 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, << Arg->getSourceRange(); return ExprError(); } - // -- a subobject [until C++20] - if (Value.hasLValuePath() && Value.getLValuePath().size() == 1 && - VD && VD->getType()->isArrayType() && + + if (Value.hasLValuePath() && Value.getLValuePath().size() == 1 && VD && + VD->getType()->isArrayType() && Value.getLValuePath()[0].getAsArrayIndex() == 0 && !Value.isLValueOnePastTheEnd() && ParamType->isPointerType()) { - 
// Per defect report (no number yet): - // ... other than a pointer to the first element of a complete array - // object. - } else if (!Value.hasLValuePath() || Value.getLValuePath().size() || - Value.isLValueOnePastTheEnd()) { - Diag(StartLoc, diag::err_non_type_template_arg_subobject) - << Value.getAsString(Context, ParamType); - return ExprError(); + SugaredConverted = TemplateArgument(VD, ParamType); + CanonicalConverted = TemplateArgument( + cast(VD->getCanonicalDecl()), CanonParamType); + return ArgResult.get(); + } + + // -- a subobject [until C++20] + if (!getLangOpts().CPlusPlus20) { + if (!Value.hasLValuePath() || Value.getLValuePath().size() || + Value.isLValueOnePastTheEnd()) { + Diag(StartLoc, diag::err_non_type_template_arg_subobject) + << Value.getAsString(Context, ParamType); + return ExprError(); + } + assert((VD || !ParamType->isReferenceType()) && + "null reference should not be a constant expression"); + assert((!VD || !ParamType->isNullPtrType()) && + "non-null value of type nullptr_t?"); } - assert((VD || !ParamType->isReferenceType()) && - "null reference should not be a constant expression"); - assert((!VD || !ParamType->isNullPtrType()) && - "non-null value of type nullptr_t?"); } if (Value.isAddrLabelDiff()) diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 9cb6c0a4ef248e..92086d7277fd1f 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -8646,21 +8646,30 @@ static void HandleRISCVRVVVectorBitsTypeAttr(QualType &CurType, ASTContext::BuiltinVectorTypeInfo Info = S.Context.getBuiltinVectorTypeInfo(CurType->castAs()); - unsigned EltSize = S.Context.getTypeSize(Info.ElementType); unsigned MinElts = Info.EC.getKnownMinValue(); + VectorKind VecKind = VectorKind::RVVFixedLengthData; + unsigned ExpectedSize = VScale->first * MinElts; + QualType EltType = CurType->getRVVEltType(S.Context); + unsigned EltSize = S.Context.getTypeSize(EltType); + unsigned NumElts; + if (Info.ElementType == 
S.Context.BoolTy) { + NumElts = VecSize / S.Context.getCharWidth(); + VecKind = VectorKind::RVVFixedLengthMask; + } else { + ExpectedSize *= EltSize; + NumElts = VecSize / EltSize; + } + // The attribute vector size must match -mrvv-vector-bits. - unsigned ExpectedSize = VScale->first * MinElts * EltSize; - if (VecSize != ExpectedSize) { + if (ExpectedSize % 8 != 0 || VecSize != ExpectedSize) { S.Diag(Attr.getLoc(), diag::err_attribute_bad_rvv_vector_size) << VecSize << ExpectedSize; Attr.setInvalid(); return; } - VectorKind VecKind = VectorKind::RVVFixedLengthData; - VecSize /= EltSize; - CurType = S.Context.getVectorType(Info.ElementType, VecSize, VecKind); + CurType = S.Context.getVectorType(EltType, NumElts, VecKind); } /// Handle OpenCL Access Qualifier Attribute. diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index fecd94e875f671..490b8cb10a4841 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -9743,6 +9743,9 @@ void ASTReader::finishPendingActions() { if (!FD->isLateTemplateParsed() && !NonConstDefn->isLateTemplateParsed() && + // We only perform ODR checks for decls not in the explicit + // global module fragment. 
+ !FD->shouldSkipCheckingODR() && FD->getODRHash() != NonConstDefn->getODRHash()) { if (!isa(FD)) { PendingFunctionOdrMergeFailures[FD].push_back(NonConstDefn); diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index a149d82153037f..110f55f8c0f49a 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -800,12 +800,15 @@ void ASTDeclReader::VisitEnumDecl(EnumDecl *ED) { BitsUnpacker EnumDeclBits(Record.readInt()); ED->setNumPositiveBits(EnumDeclBits.getNextBits(/*Width=*/8)); ED->setNumNegativeBits(EnumDeclBits.getNextBits(/*Width=*/8)); + bool ShouldSkipCheckingODR = EnumDeclBits.getNextBit(); ED->setScoped(EnumDeclBits.getNextBit()); ED->setScopedUsingClassTag(EnumDeclBits.getNextBit()); ED->setFixed(EnumDeclBits.getNextBit()); - ED->setHasODRHash(true); - ED->ODRHash = Record.readInt(); + if (!ShouldSkipCheckingODR) { + ED->setHasODRHash(true); + ED->ODRHash = Record.readInt(); + } // If this is a definition subject to the ODR, and we already have a // definition, merge this one into it. @@ -827,7 +830,10 @@ void ASTDeclReader::VisitEnumDecl(EnumDecl *ED) { Reader.MergedDeclContexts.insert(std::make_pair(ED, OldDef)); ED->demoteThisDefinitionToDeclaration(); Reader.mergeDefinitionVisibility(OldDef, ED); - if (OldDef->getODRHash() != ED->getODRHash()) + // We don't want to check the ODR hash value for declarations from global + // module fragment. + if (!ED->shouldSkipCheckingODR() && + OldDef->getODRHash() != ED->getODRHash()) Reader.PendingEnumOdrMergeFailures[OldDef].push_back(ED); } else { OldDef = ED; @@ -866,6 +872,9 @@ ASTDeclReader::VisitRecordDeclImpl(RecordDecl *RD) { void ASTDeclReader::VisitRecordDecl(RecordDecl *RD) { VisitRecordDeclImpl(RD); + // We should only reach here if we're in C/Objective-C. There is no + // global module fragment. 
+ assert(!RD->shouldSkipCheckingODR()); RD->setODRHash(Record.readInt()); // Maintain the invariant of a redeclaration chain containing only @@ -1065,6 +1074,7 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) { FD->setCachedLinkage((Linkage)FunctionDeclBits.getNextBits(/*Width=*/3)); FD->setStorageClass((StorageClass)FunctionDeclBits.getNextBits(/*Width=*/3)); + bool ShouldSkipCheckingODR = FunctionDeclBits.getNextBit(); FD->setInlineSpecified(FunctionDeclBits.getNextBit()); FD->setImplicitlyInline(FunctionDeclBits.getNextBit()); FD->setHasSkippedBody(FunctionDeclBits.getNextBit()); @@ -1094,8 +1104,10 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) { if (FD->isExplicitlyDefaulted()) FD->setDefaultLoc(readSourceLocation()); - FD->ODRHash = Record.readInt(); - FD->setHasODRHash(true); + if (!ShouldSkipCheckingODR) { + FD->ODRHash = Record.readInt(); + FD->setHasODRHash(true); + } if (FD->isDefaulted()) { if (unsigned NumLookups = Record.readInt()) { @@ -1963,6 +1975,8 @@ void ASTDeclReader::ReadCXXDefinitionData( BitsUnpacker CXXRecordDeclBits = Record.readInt(); + bool ShouldSkipCheckingODR = CXXRecordDeclBits.getNextBit(); + #define FIELD(Name, Width, Merge) \ if (!CXXRecordDeclBits.canGetNextNBits(Width)) \ CXXRecordDeclBits.updateValue(Record.readInt()); \ @@ -1971,9 +1985,12 @@ void ASTDeclReader::ReadCXXDefinitionData( #include "clang/AST/CXXRecordDeclDefinitionBits.def" #undef FIELD - // Note: the caller has deserialized the IsLambda bit already. - Data.ODRHash = Record.readInt(); - Data.HasODRHash = true; + // We only perform ODR checks for decls not in GMF. + if (!ShouldSkipCheckingODR) { + // Note: the caller has deserialized the IsLambda bit already. + Data.ODRHash = Record.readInt(); + Data.HasODRHash = true; + } if (Record.readInt()) { Reader.DefinitionSource[D] = @@ -2134,6 +2151,10 @@ void ASTDeclReader::MergeDefinitionData( } } + // We don't want to check ODR for decls in the global module fragment. 
+ if (MergeDD.Definition->shouldSkipCheckingODR()) + return; + if (D->getODRHash() != MergeDD.ODRHash) { DetectedOdrViolation = true; } @@ -3498,11 +3519,14 @@ ASTDeclReader::FindExistingResult ASTDeclReader::findExisting(NamedDecl *D) { // If this declaration is from a merged context, make a note that we need to // check that the canonical definition of that context contains the decl. // + // Note that we don't perform ODR checks for decls from the global module + // fragment. + // // FIXME: We should do something similar if we merge two definitions of the // same template specialization into the same CXXRecordDecl. auto MergedDCIt = Reader.MergedDeclContexts.find(D->getLexicalDeclContext()); if (MergedDCIt != Reader.MergedDeclContexts.end() && - MergedDCIt->second == D->getDeclContext()) + !D->shouldSkipCheckingODR() && MergedDCIt->second == D->getDeclContext()) Reader.PendingOdrMergeChecks.push_back(D); return FindExistingResult(Reader, D, /*Existing=*/nullptr, diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 03bddfe0f5047d..378a1f86bd5342 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6010,6 +6010,9 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) { BitsPacker DefinitionBits; + bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + DefinitionBits.addBit(ShouldSkipCheckingODR); + #define FIELD(Name, Width, Merge) \ if (!DefinitionBits.canWriteNextNBits(Width)) { \ Record->push_back(DefinitionBits); \ @@ -6022,8 +6025,11 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) { Record->push_back(DefinitionBits); - // getODRHash will compute the ODRHash if it has not been previously computed. - Record->push_back(D->getODRHash()); + // We only perform ODR checks for decls not in GMF. + if (!ShouldSkipCheckingODR) + // getODRHash will compute the ODRHash if it has not been previously + // computed. 
+ Record->push_back(D->getODRHash()); bool ModulesDebugInfo = Writer->Context->getLangOpts().ModulesDebugInfo && !D->isDependentType(); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index bb1f51786d2813..42583c09f009e0 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -488,12 +488,16 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { BitsPacker EnumDeclBits; EnumDeclBits.addBits(D->getNumPositiveBits(), /*BitWidth=*/8); EnumDeclBits.addBits(D->getNumNegativeBits(), /*BitWidth=*/8); + bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + EnumDeclBits.addBit(ShouldSkipCheckingODR); EnumDeclBits.addBit(D->isScoped()); EnumDeclBits.addBit(D->isScopedUsingClassTag()); EnumDeclBits.addBit(D->isFixed()); Record.push_back(EnumDeclBits); - Record.push_back(D->getODRHash()); + // We only perform ODR checks for decls not in GMF. + if (!ShouldSkipCheckingODR) + Record.push_back(D->getODRHash()); if (MemberSpecializationInfo *MemberInfo = D->getMemberSpecializationInfo()) { Record.AddDeclRef(MemberInfo->getInstantiatedFrom()); @@ -510,7 +514,7 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { !D->isTopLevelDeclInObjCContainer() && !CXXRecordDecl::classofKind(D->getKind()) && !D->getIntegerTypeSourceInfo() && !D->getMemberSpecializationInfo() && - !needsAnonymousDeclarationNumber(D) && + !needsAnonymousDeclarationNumber(D) && !D->shouldSkipCheckingODR() && D->getDeclName().getNameKind() == DeclarationName::Identifier) AbbrevToUse = Writer.getDeclEnumAbbrev(); @@ -676,6 +680,8 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { // FIXME: stable encoding FunctionDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), 3); FunctionDeclBits.addBits((uint32_t)D->getStorageClass(), /*BitWidth=*/3); + bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + FunctionDeclBits.addBit(ShouldSkipCheckingODR); FunctionDeclBits.addBit(D->isInlineSpecified()); 
FunctionDeclBits.addBit(D->isInlined()); FunctionDeclBits.addBit(D->hasSkippedBody()); @@ -701,7 +707,9 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { if (D->isExplicitlyDefaulted()) Record.AddSourceLocation(D->getDefaultLoc()); - Record.push_back(D->getODRHash()); + // We only perform ODR checks for decls not in GMF. + if (!ShouldSkipCheckingODR) + Record.push_back(D->getODRHash()); if (D->isDefaulted()) { if (auto *FDI = D->getDefaultedFunctionInfo()) { @@ -1506,7 +1514,8 @@ void ASTDeclWriter::VisitCXXMethodDecl(CXXMethodDecl *D) { D->getFirstDecl() == D->getMostRecentDecl() && !D->isInvalidDecl() && !D->hasAttrs() && !D->isTopLevelDeclInObjCContainer() && D->getDeclName().getNameKind() == DeclarationName::Identifier && - !D->hasExtInfo() && !D->isExplicitlyDefaulted()) { + !D->shouldSkipCheckingODR() && !D->hasExtInfo() && + !D->isExplicitlyDefaulted()) { if (D->getTemplatedKind() == FunctionDecl::TK_NonTemplate || D->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate || D->getTemplatedKind() == FunctionDecl::TK_MemberSpecialization || @@ -2132,12 +2141,13 @@ getFunctionDeclAbbrev(serialization::DeclCode Code) { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 11)); // IDNS Abv->Add(BitCodeAbbrevOp( BitCodeAbbrevOp::Fixed, - 27)); // Packed Function Bits: StorageClass, Inline, InlineSpecified, + 28)); // Packed Function Bits: StorageClass, Inline, InlineSpecified, // VirtualAsWritten, Pure, HasInheritedProto, HasWrittenProto, // Deleted, Trivial, TrivialForCall, Defaulted, ExplicitlyDefaulted, // IsIneligibleOrNotSelected, ImplicitReturnZero, Constexpr, // UsesSEHTry, SkippedBody, MultiVersion, LateParsed, - // FriendConstraintRefersToEnclosingTemplate, Linkage + // FriendConstraintRefersToEnclosingTemplate, Linkage, + // ShouldSkipCheckingODR Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LocEnd Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // ODRHash // This Array slurps the rest of the record. 
Fortunately we want to encode @@ -2264,7 +2274,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // AddTypeRef Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // IntegerType Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // getPromotionType - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 19)); // Enum Decl Bits + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 20)); // Enum Decl Bits Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));// ODRHash Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // InstantiatedMembEnum // DC diff --git a/clang/lib/StaticAnalyzer/Checkers/Taint.cpp b/clang/lib/StaticAnalyzer/Checkers/Taint.cpp index 4edb671753bf45..6362c82b009d72 100644 --- a/clang/lib/StaticAnalyzer/Checkers/Taint.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/Taint.cpp @@ -216,21 +216,17 @@ std::vector taint::getTaintedSymbolsImpl(ProgramStateRef State, std::vector TaintedSymbols; if (!Reg) return TaintedSymbols; - // Element region (array element) is tainted if either the base or the offset - // are tainted. + + // Element region (array element) is tainted if the offset is tainted. if (const ElementRegion *ER = dyn_cast(Reg)) { std::vector TaintedIndex = getTaintedSymbolsImpl(State, ER->getIndex(), K, returnFirstOnly); llvm::append_range(TaintedSymbols, TaintedIndex); if (returnFirstOnly && !TaintedSymbols.empty()) return TaintedSymbols; // return early if needed - std::vector TaintedSuperRegion = - getTaintedSymbolsImpl(State, ER->getSuperRegion(), K, returnFirstOnly); - llvm::append_range(TaintedSymbols, TaintedSuperRegion); - if (returnFirstOnly && !TaintedSymbols.empty()) - return TaintedSymbols; // return early if needed } + // Symbolic region is tainted if the corresponding symbol is tainted. 
if (const SymbolicRegion *SR = dyn_cast(Reg)) { std::vector TaintedRegions = getTaintedSymbolsImpl(State, SR->getSymbol(), K, returnFirstOnly); @@ -239,6 +235,8 @@ std::vector taint::getTaintedSymbolsImpl(ProgramStateRef State, return TaintedSymbols; // return early if needed } + // Any subregion (including Element and Symbolic regions) is tainted if its + // super-region is tainted. if (const SubRegion *ER = dyn_cast(Reg)) { std::vector TaintedSubRegions = getTaintedSymbolsImpl(State, ER->getSuperRegion(), K, returnFirstOnly); @@ -318,4 +316,4 @@ std::vector taint::getTaintedSymbolsImpl(ProgramStateRef State, } } return TaintedSymbols; -} \ No newline at end of file +} diff --git a/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp index e5dd907c660d8e..b2947f590c4ec1 100644 --- a/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp @@ -205,8 +205,12 @@ void InvalidPtrChecker::postPreviousReturnInvalidatingCall( CE, LCtx, CE->getType(), C.blockCount()); State = State->BindExpr(CE, LCtx, RetVal); + const auto *SymRegOfRetVal = + dyn_cast_or_null(RetVal.getAsRegion()); + if (!SymRegOfRetVal) + return; + // Remember to this region. 
- const auto *SymRegOfRetVal = cast(RetVal.getAsRegion()); const MemRegion *MR = SymRegOfRetVal->getBaseRegion(); State = State->set(FD, MR); diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp index 0ac1d91b79beb5..bc14aea27f6736 100644 --- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp +++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp @@ -1409,7 +1409,7 @@ CallEventManager::getSimpleCall(const CallExpr *CE, ProgramStateRef State, if (const auto *OpCE = dyn_cast(CE)) { const FunctionDecl *DirectCallee = OpCE->getDirectCallee(); if (const auto *MD = dyn_cast(DirectCallee)) - if (MD->isInstance()) + if (MD->isImplicitObjectMemberFunction()) return create(OpCE, State, LCtx, ElemRef); } else if (CE->getCallee()->getType()->isBlockPointerType()) { diff --git a/clang/lib/StaticAnalyzer/Core/Environment.cpp b/clang/lib/StaticAnalyzer/Core/Environment.cpp index 4f989ed59bee38..427f51109853bd 100644 --- a/clang/lib/StaticAnalyzer/Core/Environment.cpp +++ b/clang/lib/StaticAnalyzer/Core/Environment.cpp @@ -40,8 +40,11 @@ static const Expr *ignoreTransparentExprs(const Expr *E) { switch (E->getStmtClass()) { case Stmt::OpaqueValueExprClass: - E = cast(E)->getSourceExpr(); - break; + if (const Expr *SE = cast(E)->getSourceExpr()) { + E = SE; + break; + } + return E; case Stmt::ExprWithCleanupsClass: E = cast(E)->getSubExpr(); break; @@ -98,7 +101,6 @@ SVal Environment::getSVal(const EnvironmentEntry &Entry, case Stmt::CXXBindTemporaryExprClass: case Stmt::ExprWithCleanupsClass: case Stmt::GenericSelectionExprClass: - case Stmt::OpaqueValueExprClass: case Stmt::ConstantExprClass: case Stmt::ParenExprClass: case Stmt::SubstNonTypeTemplateParmExprClass: diff --git a/clang/test/AST/ast-crash-doc-function-template.cpp b/clang/test/AST/ast-crash-doc-function-template.cpp new file mode 100644 index 00000000000000..d48eb0dbe02f01 --- /dev/null +++ b/clang/test/AST/ast-crash-doc-function-template.cpp @@ -0,0 +1,30 @@ +// RUN: rm -rf %t 
+// RUN: split-file %s %t + +// RUN: %clang_cc1 -x c++ -Wdocumentation -fsyntax-only -ast-dump-all %t/t.cpp + +//--- t.h +/// MyClass in the header file +class MyClass { +public: + template + void Foo() const; + + /// Bar + void Bar() const; +}; + +//--- t.cpp +#include "t.h" + +/// MyClass::Bar: Foo() is implicitly instantiated and called here. +void MyClass::Bar() const { + Foo(); +} + +/// MyClass::Foo +template +void MyClass::Foo() const { +} + +// CHECK: TranslationUnitDecl diff --git a/clang/test/AST/ast-dump-override-final.cpp b/clang/test/AST/ast-dump-override-final.cpp new file mode 100644 index 00000000000000..c1cee6b01565f6 --- /dev/null +++ b/clang/test/AST/ast-dump-override-final.cpp @@ -0,0 +1,20 @@ +// This file contain tests to check if override and final are dumped in the +// correct positions. + +// RUN: %clang_cc1 -ast-print -x c++ %s -o - | FileCheck %s + +// CHECK: class A { +class A { + // CHECK-NEXT: virtual void f(); + virtual void f(); + + // CHECK-NEXT: virtual void g() final; + virtual void g() final; +} AA; + +// CHECK: class B : public A { +class B : public A { + // CHECK-NEXT: virtual void f() override { + virtual void f() override { + }; +} B; diff --git a/clang/test/AST/ast-dump-static-operators.cpp b/clang/test/AST/ast-dump-static-operators.cpp new file mode 100644 index 00000000000000..e8454bdac02f7b --- /dev/null +++ b/clang/test/AST/ast-dump-static-operators.cpp @@ -0,0 +1,55 @@ +// RUN: %clang_cc1 -std=c++23 %s -ast-dump -triple x86_64-unknown-unknown -o - | FileCheck -strict-whitespace %s + +struct Functor { + static int operator()(int x, int y) { + return x + y; + } + static int operator[](int x, int y) { + return x + y; + } +}; + +Functor& get_functor() { + static Functor functor; + return functor; +} + +void call_static_operators() { + Functor functor; + + int z1 = functor(1, 2); + // CHECK: CXXOperatorCallExpr {{.*}} 'int' '()' + // CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'int (*)(int, int)' + // CHECK-NEXT: | `-DeclRefExpr 
{{.*}} 'int (int, int)' lvalue CXXMethod {{.*}} 'operator()' 'int (int, int)' + // CHECK-NEXT: |-DeclRefExpr {{.*}} 'Functor' lvalue Var {{.*}} 'functor' 'Functor' + // CHECK-NEXT: |-IntegerLiteral {{.*}} 'int' 1 + // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 2 + + int z2 = functor[1, 2]; + // CHECK: CXXOperatorCallExpr {{.*}} 'int' '[]' + // CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'int (*)(int, int)' + // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'int (int, int)' lvalue CXXMethod {{.*}} 'operator[]' 'int (int, int)' + // CHECK-NEXT: |-DeclRefExpr {{.*}} 'Functor' lvalue Var {{.*}} 'functor' 'Functor' + // CHECK-NEXT: |-IntegerLiteral {{.*}} 'int' 1 + // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 2 + + int z3 = get_functor()(1, 2); + // CHECK: CXXOperatorCallExpr {{.*}} 'int' '()' + // CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'int (*)(int, int)' + // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'int (int, int)' lvalue CXXMethod {{.*}} 'operator()' 'int (int, int)' + // CHECK-NEXT: |-CallExpr {{.*}} 'Functor' lvalue + // CHECK-NEXT: | `-ImplicitCastExpr {{.*}} 'Functor &(*)()' + // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'Functor &()' lvalue Function {{.*}} 'get_functor' 'Functor &()' + // CHECK-NEXT: |-IntegerLiteral {{.*}} 'int' 1 + // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 2 + + int z4 = get_functor()[1, 2]; + // CHECK: CXXOperatorCallExpr {{.*}} 'int' '[]' + // CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'int (*)(int, int)' + // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'int (int, int)' lvalue CXXMethod {{.*}} 'operator[]' 'int (int, int)' + // CHECK-NEXT: |-CallExpr {{.*}} 'Functor' lvalue + // CHECK-NEXT: | `-ImplicitCastExpr {{.*}} 'Functor &(*)()' + // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'Functor &()' lvalue Function {{.*}} 'get_functor' 'Functor &()' + // CHECK-NEXT: |-IntegerLiteral {{.*}} 'int' 1 + // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 2 +} diff --git a/clang/test/Analysis/cxx2b-deducing-this.cpp b/clang/test/Analysis/cxx2b-deducing-this.cpp index d22a897097bec0..2ec9e96bf0f84f 100644 
--- a/clang/test/Analysis/cxx2b-deducing-this.cpp +++ b/clang/test/Analysis/cxx2b-deducing-this.cpp @@ -60,3 +60,14 @@ void top() { s.c(); s.c(11); } + + +struct S2 { + bool operator==(this auto, S2) { + return true; + } +}; +void use_deducing_this() { + int result = S2{} == S2{}; // no-crash + clang_analyzer_dump(result); // expected-warning {{1 S32b}} +} diff --git a/clang/test/Analysis/invalid-ptr-checker.cpp b/clang/test/Analysis/invalid-ptr-checker.cpp new file mode 100644 index 00000000000000..58bb45e0fb8421 --- /dev/null +++ b/clang/test/Analysis/invalid-ptr-checker.cpp @@ -0,0 +1,10 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,security.cert.env.InvalidPtr -verify %s + +// expected-no-diagnostics + +namespace other { +int strerror(int errnum); // custom strerror +void no_crash_on_custom_strerror() { + (void)strerror(0); // no-crash +} +} // namespace other diff --git a/clang/test/Analysis/templates.cpp b/clang/test/Analysis/templates.cpp index 061c19fe7e0445..6da1821b70f26f 100644 --- a/clang/test/Analysis/templates.cpp +++ b/clang/test/Analysis/templates.cpp @@ -68,3 +68,16 @@ namespace rdar13954714 { // force instantiation template void blockWithStatic(); } + +namespace structural_value_crash { + constexpr char abc[] = "abc"; + + template + void use_template_param() { + const char *p = in; + } + + void force_instantiate() { + use_template_param(); + } +} diff --git a/clang/test/CXX/class.derived/class.member.lookup/p11.cpp b/clang/test/CXX/class.derived/class.member.lookup/p11.cpp index e0899b227e69bd..a42febaca3f041 100644 --- a/clang/test/CXX/class.derived/class.member.lookup/p11.cpp +++ b/clang/test/CXX/class.derived/class.member.lookup/p11.cpp @@ -23,3 +23,25 @@ struct D: I1, I2, B2 { int D::* mpD = &D::i; // expected-error {{non-static member 'i' found in multiple base-class subobjects of type 'B1'}} } }; + +namespace GH80435 { +struct A { + void *data; // expected-note {{member found by ambiguous name lookup}} +}; + +class B { + void *data; 
// expected-note {{member found by ambiguous name lookup}} +}; + +struct C : A, B {}; + +decltype(C().data) x; // expected-error {{member 'data' found in multiple base classes of different types}} + +struct D { // expected-note {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'C' to 'const D' for 1st argument}} + // expected-note@-1{{candidate constructor (the implicit move constructor) not viable: no known conversion from 'C' to 'D' for 1st argument}} + template + D(Container); // expected-note {{candidate template ignored: substitution failure [with Container = C]: member 'data' found in multiple base classes of different types}} +}; + +D y(C{}); // expected-error {{no matching constructor for initialization of 'D'}} +} diff --git a/clang/test/CXX/drs/dr14xx.cpp b/clang/test/CXX/drs/dr14xx.cpp index 4c29d03a6e117a..d262f6f9dcab79 100644 --- a/clang/test/CXX/drs/dr14xx.cpp +++ b/clang/test/CXX/drs/dr14xx.cpp @@ -488,6 +488,16 @@ namespace dr1467 { // dr1467: 3.7 c++11 } } // nonaggregate + namespace SelfInitIsNotListInit { + struct S { + S(); + explicit S(S &); + S(const S &); + }; + S s1; + S s2 = {s1}; // ok, not list-initialization so we pick the non-explicit constructor + } + struct NestedInit { int a, b, c; }; NestedInit ni[1] = {{NestedInit{1, 2, 3}}}; diff --git a/clang/test/CXX/drs/dr21xx.cpp b/clang/test/CXX/drs/dr21xx.cpp index 87040246aa5cd4..a7e50df3f374be 100644 --- a/clang/test/CXX/drs/dr21xx.cpp +++ b/clang/test/CXX/drs/dr21xx.cpp @@ -11,16 +11,6 @@ // cxx98-error@-1 {{variadic macros are a C99 feature}} #endif -namespace std { - __extension__ typedef __SIZE_TYPE__ size_t; - - template struct initializer_list { - const E *p; size_t n; - initializer_list(const E *p, size_t n); - initializer_list(); - }; -} - namespace dr2100 { // dr2100: 12 template struct X {}; template struct A { @@ -142,41 +132,6 @@ namespace dr2126 { // dr2126: 12 #endif } -namespace dr2137 { // dr2137: 18 -#if __cplusplus >= 201103L - 
struct Q { - Q(); - Q(Q&&); - Q(std::initializer_list) = delete; // #dr2137-Qcons - }; - - Q x = Q { Q() }; - // since-cxx11-error@-1 {{call to deleted constructor of 'Q'}} - // since-cxx11-note@#dr2137-Qcons {{'Q' has been explicitly marked deleted here}} - - int f(Q); // #dr2137-f - int y = f({ Q() }); - // since-cxx11-error@-1 {{call to deleted constructor of 'Q'}} - // since-cxx11-note@#dr2137-Qcons {{'Q' has been explicitly marked deleted here}} - // since-cxx11-note@#dr2137-f {{passing argument to parameter here}} - - struct U { - U(); - U(const U&); - }; - - struct Derived : U { - Derived(); - Derived(const Derived&); - } d; - - int g(Derived); - int g(U(&&)[1]) = delete; - - int z = g({ d }); -#endif -} - namespace dr2140 { // dr2140: 9 #if __cplusplus >= 201103L union U { int a; decltype(nullptr) b; }; diff --git a/clang/test/CXX/drs/dr23xx.cpp b/clang/test/CXX/drs/dr23xx.cpp index d8556998315c77..03077ae9239a45 100644 --- a/clang/test/CXX/drs/dr23xx.cpp +++ b/clang/test/CXX/drs/dr23xx.cpp @@ -6,16 +6,6 @@ // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s // RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s -namespace std { - __extension__ typedef __SIZE_TYPE__ size_t; - - template struct initializer_list { - const E *p; size_t n; - initializer_list(const E *p, size_t n); - initializer_list(); - }; -} - #if __cplusplus >= 201103L namespace dr2303 { // dr2303: 12 template @@ -57,81 +47,6 @@ void g() { } //namespace dr2303 #endif -namespace dr2311 { // dr2311: 18 open -#if __cplusplus >= 201707L -template -void test() { - // Ensure none of these try to call a move constructor. 
- T a = T{T(0)}; - T b{T(0)}; - auto c{T(0)}; - T d = {T(0)}; - auto e = {T(0)}; -#if __cplusplus >= 202302L - auto f = auto{T(0)}; -#endif - void(*fn)(T); - fn({T(0)}); -} - -struct NonMovable { - NonMovable(int); - NonMovable(NonMovable&&) = delete; -}; -struct NonMovableNonApplicableIList { - NonMovableNonApplicableIList(int); - NonMovableNonApplicableIList(NonMovableNonApplicableIList&&) = delete; - NonMovableNonApplicableIList(std::initializer_list); -}; -struct ExplicitMovable { - ExplicitMovable(int); - explicit ExplicitMovable(ExplicitMovable&&); -}; -struct ExplicitNonMovable { - ExplicitNonMovable(int); - explicit ExplicitNonMovable(ExplicitNonMovable&&) = delete; -}; -struct ExplicitNonMovableNonApplicableIList { - ExplicitNonMovableNonApplicableIList(int); - explicit ExplicitNonMovableNonApplicableIList(ExplicitNonMovableNonApplicableIList&&) = delete; - ExplicitNonMovableNonApplicableIList(std::initializer_list); -}; -struct CopyOnly { - CopyOnly(int); - CopyOnly(const CopyOnly&); - CopyOnly(CopyOnly&&) = delete; -}; -struct ExplicitCopyOnly { - ExplicitCopyOnly(int); - explicit ExplicitCopyOnly(const ExplicitCopyOnly&); - explicit ExplicitCopyOnly(ExplicitCopyOnly&&) = delete; -}; - -template void test(); -template void test(); -template void test(); -template void test(); -template void test(); -template void test(); -template void test(); - -struct any { - template - any(T&&); -}; - -template -struct X { - X(); - X(T) = delete; // #dr2311-X -}; - -X> x{ X>() }; -// since-cxx17-error@-1 {{call to deleted constructor of 'X>'}} -// since-cxx17-note@#dr2311-X {{'X' has been explicitly marked deleted here}} -#endif -} - // dr2331: na // dr2335 is in dr2335.cxx diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la32.c b/clang/test/CodeGen/LoongArch/intrinsic-la32.c index 93d54f511a9cd2..eb3f8cbe7ac4cc 100644 --- a/clang/test/CodeGen/LoongArch/intrinsic-la32.c +++ b/clang/test/CodeGen/LoongArch/intrinsic-la32.c @@ -169,8 +169,8 @@ unsigned int 
cpucfg(unsigned int a) { // LA32-LABEL: @rdtime( // LA32-NEXT: entry: -// LA32-NEXT: [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc !2 -// LA32-NEXT: [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc !3 +// LA32-NEXT: [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc [[META2:![0-9]+]] +// LA32-NEXT: [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc [[META3:![0-9]+]] // LA32-NEXT: ret void // void rdtime() { @@ -201,13 +201,28 @@ void loongarch_movgr2fcsr(int a) { __builtin_loongarch_movgr2fcsr(1, a); } -// CHECK-LABEL: @cacop_w( -// CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A:%.*]], i32 1024) -// CHECK-NEXT: tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A]], i32 1024) -// CHECK-NEXT: ret void +// LA32-LABEL: @cacop_w( +// LA32-NEXT: entry: +// LA32-NEXT: tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A:%.*]], i32 1024) +// LA32-NEXT: tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A]], i32 1024) +// LA32-NEXT: ret void // void cacop_w(unsigned long int a) { __cacop_w(1, a, 1024); __builtin_loongarch_cacop_w(1, a, 1024); } + +// LA32-LABEL: @iocsrrd_h_result( +// LA32-NEXT: entry: +// LA32-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A:%.*]]) +// LA32-NEXT: [[CONV_I:%.*]] = trunc i32 [[TMP0]] to i16 +// LA32-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A]]) +// LA32-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// LA32-NEXT: [[CONV3:%.*]] = add i16 [[TMP2]], [[CONV_I]] +// LA32-NEXT: ret i16 [[CONV3]] +// +unsigned short iocsrrd_h_result(unsigned int a) { + unsigned short b = __iocsrrd_h(a); + unsigned short c = __builtin_loongarch_iocsrrd_h(a); + return b+c; +} diff --git 
a/clang/test/CodeGen/LoongArch/intrinsic-la64.c b/clang/test/CodeGen/LoongArch/intrinsic-la64.c index a740882eef5411..50ec358f546ec0 100644 --- a/clang/test/CodeGen/LoongArch/intrinsic-la64.c +++ b/clang/test/CodeGen/LoongArch/intrinsic-la64.c @@ -387,7 +387,7 @@ unsigned int cpucfg(unsigned int a) { // CHECK-LABEL: @rdtime_d( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call { i64, i64 } asm sideeffect "rdtime.d $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc !2 +// CHECK-NEXT: [[TMP0:%.*]] = tail call { i64, i64 } asm sideeffect "rdtime.d $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc [[META2:![0-9]+]] // CHECK-NEXT: ret void // void rdtime_d() { @@ -396,8 +396,8 @@ void rdtime_d() { // CHECK-LABEL: @rdtime( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc !3 -// CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc !4 +// CHECK-NEXT: [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc [[META3:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc [[META4:![0-9]+]] // CHECK-NEXT: ret void // void rdtime() { @@ -427,3 +427,18 @@ void loongarch_movgr2fcsr(int a) { __movgr2fcsr(1, a); __builtin_loongarch_movgr2fcsr(1, a); } + +// CHECK-LABEL: @iocsrrd_h_result( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A:%.*]]) +// CHECK-NEXT: [[CONV_I:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: [[CONV3:%.*]] = add i16 [[TMP2]], [[CONV_I]] +// CHECK-NEXT: ret i16 [[CONV3]] +// +unsigned short iocsrrd_h_result(unsigned int a) { + unsigned short b = 
__iocsrrd_h(a); + unsigned short c = __builtin_loongarch_iocsrrd_h(a); + return b+c; +} diff --git a/clang/test/CodeGen/Mips/inline-asm-constraints.c b/clang/test/CodeGen/Mips/inline-asm-constraints.c new file mode 100644 index 00000000000000..88afe8735083b4 --- /dev/null +++ b/clang/test/CodeGen/Mips/inline-asm-constraints.c @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -emit-llvm -triple mips -target-feature +soft-float %s -o - | FileCheck %s --check-prefix=SOFT_FLOAT + +// SOFT_FLOAT: call void asm sideeffect "", "r,~{$1}"(float %1) +void read_float(float *p) { + __asm__("" ::"r"(*p)); +} + +// SOFT_FLOAT: call void asm sideeffect "", "r,~{$1}"(double %1) +void read_double(double *p) { + __asm__("" :: "r"(*p)); +} diff --git a/clang/test/CodeGen/RISCV/riscv-func-attr-target.c b/clang/test/CodeGen/RISCV/riscv-func-attr-target.c index 7d3362e84e7588..f216eaf735b4a8 100644 --- a/clang/test/CodeGen/RISCV/riscv-func-attr-target.c +++ b/clang/test/CodeGen/RISCV/riscv-func-attr-target.c @@ -39,7 +39,7 @@ __attribute__((target("cpu=sifive-u54"))) void testAttrCpuOnly() {} // CHECK: attributes #0 = { {{.*}}"target-features"="+64bit,+a,+m,+save-restore,+zifencei,-relax,-zbb,-zfa" } // CHECK: attributes #1 = { {{.*}}"target-cpu"="rocket-rv64" "target-features"="+64bit,+a,+d,+f,+m,+save-restore,+v,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-relax,-zbb,-zfa" "tune-cpu"="generic-rv64" } // CHECK: attributes #2 = { {{.*}}"target-features"="+64bit,+a,+m,+save-restore,+zbb,+zifencei,-relax,-zfa" } -// CHECK: attributes #3 = { {{.*}}"target-features"="+64bit,+a,+d,+experimental-zicond,+f,+m,+save-restore,+v,+zbb,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-relax,-zfa" } +// CHECK: attributes #3 = { {{.*}}"target-features"="+64bit,+a,+d,+f,+m,+save-restore,+v,+zbb,+zicond,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-relax,-zfa" } // Make sure we append negative features if we 
override the arch // CHECK: attributes #4 = { {{.*}}"target-features"="+64bit,+a,+c,+d,+f,+m,+save-restore,+zbb,+zicsr,+zifencei,{{(-[[:alnum:]-]+)(,-[[:alnum:]-]+)*}}" } // CHECK: attributes #5 = { {{.*}}"target-features"="+64bit,+m,+save-restore,{{(-[[:alnum:]-]+)(,-[[:alnum:]-]+)*}}" } diff --git a/clang/test/CodeGen/RISCV/tls-dialect.c b/clang/test/CodeGen/RISCV/tls-dialect.c new file mode 100644 index 00000000000000..e624a8b3fe4e67 --- /dev/null +++ b/clang/test/CodeGen/RISCV/tls-dialect.c @@ -0,0 +1,14 @@ +// REQUIRES: riscv-registered-target +/// cc1 -enable-tlsdesc (due to -mtls-dialect=desc) enables TLSDESC. +// RUN: %clang_cc1 -triple riscv64 -S -mrelocation-model pic -pic-level 1 -enable-tlsdesc %s -o - | FileCheck %s --check-prefix=DESC +// RUN: %clang_cc1 -triple riscv64 -S -mrelocation-model pic -pic-level 1 %s -o - | FileCheck %s --check-prefix=NODESC + +__thread int x; + +// DESC: %tlsdesc_hi +// DESC-NOT: %tls_gd_pcrel_hi +// NODESC: %tls_gd_pcrel_hi +// NODESC-NOT: %tlsdesc_hi +int use() { + return x; +} diff --git a/clang/test/CodeGen/aapcs-align.cpp b/clang/test/CodeGen/aapcs-align.cpp index 2886a32974b066..4f393d9e6b7f32 100644 --- a/clang/test/CodeGen/aapcs-align.cpp +++ b/clang/test/CodeGen/aapcs-align.cpp @@ -134,8 +134,8 @@ void g6() { f6m(1, 2, 3, 4, 5, s); } // CHECK: define{{.*}} void @g6 -// CHECK: call void @f6(i32 noundef 1, [4 x i32] [i32 6, i32 7, i32 0, i32 0]) -// CHECK: call void @f6m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [4 x i32] [i32 6, i32 7, i32 0, i32 0]) +// CHECK: call void @f6(i32 noundef 1, [4 x i32] [i32 6, i32 7, i32 0, i32 undef]) +// CHECK: call void @f6m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [4 x i32] [i32 6, i32 7, i32 0, i32 undef]) // CHECK: declare void @f6(i32 noundef, [4 x i32]) // CHECK: declare void @f6m(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [4 x i32]) } diff --git a/clang/test/CodeGen/aapcs64-align.cpp 
b/clang/test/CodeGen/aapcs64-align.cpp index 759413cbc4b56f..de231f2123b975 100644 --- a/clang/test/CodeGen/aapcs64-align.cpp +++ b/clang/test/CodeGen/aapcs64-align.cpp @@ -75,8 +75,8 @@ void g4() { f4m(1, 2, 3, 4, 5, s); } // CHECK: define{{.*}} void @g4() -// CHECK: call void @f4(i32 noundef 1, [2 x i64] %{{.*}}) -// CHECK: void @f4m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [2 x i64] %{{.*}}) +// CHECK: call void @f4(i32 noundef 1, [2 x i64] [i64 30064771078, i64 0]) +// CHECK: void @f4m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [2 x i64] [i64 30064771078, i64 0]) // CHECK: declare void @f4(i32 noundef, [2 x i64]) // CHECK: declare void @f4m(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64]) @@ -95,8 +95,8 @@ void f5m(int, int, int, int, int, P16); f5m(1, 2, 3, 4, 5, s); } // CHECK: define{{.*}} void @g5() -// CHECK: call void @f5(i32 noundef 1, [2 x i64] %{{.*}}) -// CHECK: void @f5m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [2 x i64] %{{.*}}) +// CHECK: call void @f5(i32 noundef 1, [2 x i64] [i64 30064771078, i64 0]) +// CHECK: void @f5m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [2 x i64] [i64 30064771078, i64 0]) // CHECK: declare void @f5(i32 noundef, [2 x i64]) // CHECK: declare void @f5m(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64]) diff --git a/clang/test/CodeGen/aarch64-inline-asm.c b/clang/test/CodeGen/aarch64-inline-asm.c index 75e9a8c46b8769..8ddee560b11da4 100644 --- a/clang/test/CodeGen/aarch64-inline-asm.c +++ b/clang/test/CodeGen/aarch64-inline-asm.c @@ -95,3 +95,11 @@ void test_reduced_gpr_constraints(int var32, long var64) { // CHECK: [[ARG2:%.+]] = load i64, ptr // CHECK: call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i64 [[ARG2]]) } + +void test_sme_constraints(){ + asm("movt zt0[3, mul vl], z0" : : : "za"); +// CHECK: call void asm sideeffect "movt 
zt0[3, mul vl], z0", "~{za}"() + + asm("movt zt0[3, mul vl], z0" : : : "zt0"); +// CHECK: call void asm sideeffect "movt zt0[3, mul vl], z0", "~{zt0}"() +} \ No newline at end of file diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c index 695e0afa3d0de7..a333d85818d2fa 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s 
-check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c index a3c3d8bf13db6f..7617dcef7ea973 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple 
aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c index 2c2f100ac7f8cc..5fa4c35ed770ff 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 
-triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c index 0502073097d541..b26e32e5ff8332 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c index 60feebced32d2e..02d4d034befb78 100644 --- 
a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c index b0c1dd904284f5..c2c89aee03b5e6 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// 
RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c index 3dcc2c70d3cfcd..e036cb45feff36 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 
-fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c index 06a6a19ca3f7a6..84338597cdb30b 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature 
+sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple 
aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c index 69641df8a80fb1..7b1a8b0a0201ff 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple 
aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c index d726855c9743fb..3d2a4e4d2b38e0 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s 
| FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c index ed1e70a3a469f5..9e44d1c9253466 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c +++ 
b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// 
RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c index f93f3e5138b24d..d8e4b853308d0f 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c index b4bc041a21d588..467cf9fd092a04 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple 
aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c index 2ad7e364257235..e58021bf8bf456 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple 
aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c index a00a48ceafccb7..483e813275028f 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 
-Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c index 7f56941108828a..e62f7b92821e42 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git 
a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c index cfc2ee0f77be69..2cc99d4fb88d5f 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 
-target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c index 720254c8421203..1ff7a7fedf1bf3 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall 
-emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c index a5dec263bff2c2..257cb595250181 100644 --- 
a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c @@ -1,15 +1,15 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve \ -// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve \ -// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve \ -// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve \ -// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve \ -// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck 
%s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c index a8d881b1e2ed3d..79a11c2ec153e4 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// 
RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c index f03e998131df51..2b2b2e5c0f411d 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 
-D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c index b90358112e47f1..8d1e358176c30f 100644 --- 
a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 
-S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c index 7c210fbe6923e9..70c31a4a87e7e9 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c index 
d7ef75ce01dd70..5bc9c9088517e8 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c index f65c0ef61f8187..82c004e3105a43 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim 
| FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c index cbab1cc8e81bd4..e8706f9576915f 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c index f7f16281ff4061..99feafbd682acb 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S 
-disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c index 3fedfdc3308932..0f0c33e48bd97b 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c index 4b9b4363ec6296..a4e2616784efa7 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 
-fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c index 5e499573304c81..3e554212cb70be 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s 
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c index 26c175c76532b7..a438fd395219bb 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS 
-triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c index 8c5e7eb4019919..b0cbdc748dc80c 
100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s 
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c index b552bbb66a259e..5cc0e0e1d36e1a 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt 
-S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c index cd87ee00999500..5fd4b04056526e 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c index 6847eb99af2df3..b86cb19c01e308 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c @@ -2,10 +2,10 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - 
%s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c index c7da703ddb2718..7af8c589994fbb 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone 
-Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c index 6f20b37f87897d..5937a288dd8468 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c @@ -2,12 +2,12 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | 
opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c index 781b699c882bb7..f54c09d5ef2c37 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c @@ -2,12 +2,12 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c index 2b24f65efc2663..2fb5d3bea27c23 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c @@ -2,11 +2,11 
@@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone 
-Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c index 32f9bc3ab88086..eee927acc22e34 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c index 8735016ae728cf..6308d6c596f164 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c @@ -1,11 +1,11 @@ // NOTE: 
Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c index 32eb9a4b40c405..c2ecbf93bfaa78 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S 
-passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c index 6e9d5cf4492f54..784c24c8e7cb68 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c index f4dadef8fa5789..6349cec771199e 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c @@ -1,11 +1,11 @@ // NOTE: Assertions have 
been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c index ff7f0c501f5507..3d56948e25f73b 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple 
aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c index 3f13b3a0db73d9..4cc1f3af32ec05 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple 
aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c index b8408ea31ed0b4..cc356600ab5333 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git 
a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c index 33733356f3078f..069bf13ff8d281 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -target-feature +sve -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple 
aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-targetattr.c b/clang/test/CodeGen/aarch64-targetattr.c index 02da18264da0a3..1a3a84a73dbad1 100644 --- a/clang/test/CodeGen/aarch64-targetattr.c +++ b/clang/test/CodeGen/aarch64-targetattr.c @@ -97,19 +97,19 @@ void minusarch() {} // CHECK: attributes #0 = { {{.*}} "target-features"="+crc,+fp-armv8,+lse,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" } // CHECK: attributes #1 = { {{.*}} "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" } // CHECK: attributes #2 = { {{.*}} "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8a" } -// CHECK: attributes #3 = { {{.*}} "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+ras,+rcpc,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" } +// CHECK: attributes #3 = { {{.*}} "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" } // CHECK: attributes #4 = { {{.*}} "target-cpu"="cortex-a710" "target-features"="+bf16,+complxnum,+crc,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+ras,+rcpc,+rdm,+sb,+sve,+sve2,+sve2-bitperm" } // CHECK: attributes #5 = { {{.*}} "tune-cpu"="cortex-a710" } // CHECK: attributes #6 = { {{.*}} "target-cpu"="generic" } // CHECK: attributes #7 = { {{.*}} 
"tune-cpu"="generic" } // CHECK: attributes #8 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+crc,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs" "tune-cpu"="cortex-a710" } // CHECK: attributes #9 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve" "tune-cpu"="cortex-a710" } -// CHECK: attributes #10 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+sve,+sve2" } -// CHECK: attributes #11 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,-sve" } +// CHECK: attributes #10 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+sve,+sve2" } +// CHECK: attributes #11 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,-sve" } // CHECK: attributes #12 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve" } // CHECK: attributes #13 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve,-sve2" } // CHECK: attributes #14 = { {{.*}} "target-features"="+fullfp16" } -// CHECK: attributes #15 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } -// CHECK: attributes #16 = { {{.*}} "branch-target-enforcement"="true" "guarded-control-stack"="true" {{.*}} 
"target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } +// CHECK: attributes #15 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } +// CHECK: attributes #16 = { {{.*}} "branch-target-enforcement"="true" "guarded-control-stack"="true" {{.*}} "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } // CHECK: attributes #17 = { {{.*}} "target-features"="-neon" } // CHECK: attributes #18 = { {{.*}} "target-features"="-v9.3a" } diff --git a/clang/test/CodeGen/arm64-microsoft-arguments.cpp b/clang/test/CodeGen/arm64-microsoft-arguments.cpp index e8309888dcfe21..85472645acb3b3 100644 --- a/clang/test/CodeGen/arm64-microsoft-arguments.cpp +++ b/clang/test/CodeGen/arm64-microsoft-arguments.cpp @@ -201,3 +201,18 @@ S11 f11() { S11 x; return func11(x); } + +// GH86384 +// Pass and return object with template constructor (pass directly, +// return indirectly). 
+// CHECK: define dso_local void @"?f12@@YA?AUS12@@XZ"(ptr dead_on_unwind inreg noalias writable sret(%struct.S12) align 4 {{.*}}) +// CHECK: call void @"?func12@@YA?AUS12@@U1@@Z"(ptr dead_on_unwind inreg writable sret(%struct.S12) align 4 {{.*}}, i64 {{.*}}) +struct S12 { + template S12(T*) {} + int x; +}; +S12 func12(S12 x); +S12 f12() { + S12 x((int*)0); + return func12(x); +} diff --git a/clang/test/CodeGen/attr-counted-by-pr88931.c b/clang/test/CodeGen/attr-counted-by-pr88931.c new file mode 100644 index 00000000000000..520ebd09973284 --- /dev/null +++ b/clang/test/CodeGen/attr-counted-by-pr88931.c @@ -0,0 +1,40 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -Wno-missing-declarations -emit-llvm -o - %s | FileCheck %s + +struct foo { + int x,y,z; + struct bar { + int count; + int array[] __attribute__((counted_by(count))); + }; +}; + +void init(void * __attribute__((pass_dynamic_object_size(0)))); + +// CHECK-LABEL: define dso_local void @test1( +// CHECK-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_BAR:%.*]], ptr [[P]], i64 0, i32 1 +// CHECK-NEXT: tail call void @init(ptr noundef nonnull [[ARRAY]], i64 noundef -1) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: ret void +// +void test1(struct bar *p) { + init(p->array); +} + +struct mux { + int count; + int array[] __attribute__((counted_by(count))); +}; + +struct bux { struct mux x; }; + +// CHECK-LABEL: define dso_local void @test2( +// CHECK-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @init(ptr noundef [[P]], i64 noundef -1) #[[ATTR2]] +// CHECK-NEXT: ret void +// +void test2(struct bux *p) { + init(p); +} diff --git a/clang/test/CodeGen/attr-counted-by-pr88931.cpp b/clang/test/CodeGen/attr-counted-by-pr88931.cpp new file mode 
100644 index 00000000000000..2a8cc1d07e50d9 --- /dev/null +++ b/clang/test/CodeGen/attr-counted-by-pr88931.cpp @@ -0,0 +1,21 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -Wall -emit-llvm -o - %s | FileCheck %s + +struct foo { + struct bar { + int array[]; + bar(); + }; +}; + +void init(void * __attribute__((pass_dynamic_object_size(0)))); + +// CHECK-LABEL: define dso_local void @_ZN3foo3barC1Ev( +// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(1) [[THIS:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @_Z4initPvU25pass_dynamic_object_size0(ptr noundef nonnull [[THIS]], i64 noundef -1) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: ret void +// +foo::bar::bar() { + init(array); +} diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c index 74d5457e398b91..118e6b889e2672 100644 --- a/clang/test/CodeGen/attr-counted-by.c +++ b/clang/test/CodeGen/attr-counted-by.c @@ -1288,16 +1288,10 @@ int test14(int idx) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test15( // NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR4]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[FOO:%.*]] = alloca [[STRUCT_ANON_8:%.*]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[FOO]]) #[[ATTR12]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 1, ptr [[FOO]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4 -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 2, ptr [[TMP0]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANON_8]], ptr [[FOO]], i64 0, i32 2, i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr 
[[ARRAYIDX]], align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[FOO]]) #[[ATTR12]] -// NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP1]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr getelementptr inbounds ([[STRUCT_ANON_8:%.*]], ptr @__const.test15.foo, i64 1, i32 0), i64 0, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP0]] // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test15( // SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -1315,16 +1309,10 @@ int test14(int idx) { // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test15( // NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR1]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[FOO:%.*]] = alloca [[STRUCT_ANON_8:%.*]], align 4 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[FOO]]) #[[ATTR9]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 1, ptr [[FOO]], align 4 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 2, ptr [[TMP0]], align 4 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANON_8]], ptr [[FOO]], i64 0, i32 2, i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[FOO]]) #[[ATTR9]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP1]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr getelementptr inbounds ([[STRUCT_ANON_8:%.*]], ptr 
@__const.test15.foo, i64 1, i32 0), i64 0, i64 [[IDXPROM]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP0]] // int test15(int idx) { struct { diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c index 886af083f1c009..b591249bbef1bc 100644 --- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c +++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c @@ -18,8 +18,29 @@ typedef __rvv_uint64m1_t vuint64m1_t; typedef __rvv_float32m1_t vfloat32m1_t; typedef __rvv_float64m1_t vfloat64m1_t; +typedef __rvv_bool1_t vbool1_t; +typedef __rvv_bool2_t vbool2_t; +typedef __rvv_bool4_t vbool4_t; +typedef __rvv_bool8_t vbool8_t; +typedef __rvv_bool16_t vbool16_t; +typedef __rvv_bool32_t vbool32_t; +typedef __rvv_bool64_t vbool64_t; + typedef vint64m1_t fixed_int64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); typedef vfloat64m1_t fixed_float64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool2_t fixed_bool2_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 2))); +typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 4))); +typedef vbool8_t fixed_bool8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 8))); +#if __riscv_v_fixed_vlen >= 128 +typedef vbool16_t fixed_bool16_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 16))); +#endif +#if __riscv_v_fixed_vlen >= 256 +typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 32))); +#endif +#if __riscv_v_fixed_vlen >= 512 +typedef vbool64_t fixed_bool64_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 64))); +#endif #define DEFINE_STRUCT(ty) \ struct struct_##ty { \ @@ -28,6 +49,19 
@@ typedef vfloat64m1_t fixed_float64m1_t __attribute__((riscv_rvv_vector_bits(__ri DEFINE_STRUCT(int64m1) DEFINE_STRUCT(float64m1) +DEFINE_STRUCT(bool1) +DEFINE_STRUCT(bool2) +DEFINE_STRUCT(bool4) +DEFINE_STRUCT(bool8) +#if __riscv_v_fixed_vlen >= 128 +DEFINE_STRUCT(bool16) +#endif +#if __riscv_v_fixed_vlen >= 256 +DEFINE_STRUCT(bool32) +#endif +#if __riscv_v_fixed_vlen >= 512 +DEFINE_STRUCT(bool64) +#endif //===----------------------------------------------------------------------===// // int64 @@ -136,3 +170,69 @@ vfloat64m1_t read_float64m1(struct struct_float64m1 *s) { void write_float64m1(struct struct_float64m1 *s, vfloat64m1_t x) { s->y[0] = x; } + +//===----------------------------------------------------------------------===// +// bool +//===----------------------------------------------------------------------===// + +// CHECK-64-LABEL: @read_bool1( +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8 +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: store <8 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: ret [[TMP1]] +// +// CHECK-128-LABEL: @read_bool1( +// CHECK-128-NEXT: entry: +// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <16 x i8>, align 16 +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA4]] +// CHECK-128-NEXT: store <16 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA4]] +// CHECK-128-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA4]] +// CHECK-128-NEXT: ret [[TMP1]] +// +// CHECK-256-LABEL: @read_bool1( +// CHECK-256-NEXT: entry: +// 
CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca <32 x i8>, align 32 +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: store <32 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 32, !tbaa [[TBAA4]] +// CHECK-256-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 32, !tbaa [[TBAA4]] +// CHECK-256-NEXT: ret [[TMP1]] +// +vbool1_t read_bool1(struct struct_bool1 *s) { + return s->y[0]; +} + +// CHECK-64-LABEL: @write_bool1( +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 8 +// CHECK-64-NEXT: store [[X:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]] +// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-64-NEXT: store <8 x i8> [[TMP0]], ptr [[Y]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: ret void +// +// CHECK-128-LABEL: @write_bool1( +// CHECK-128-NEXT: entry: +// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 16 +// CHECK-128-NEXT: store [[X:%.*]], ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA7:![0-9]+]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA4]] +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: store <16 x i8> [[TMP0]], ptr [[Y]], align 8, !tbaa [[TBAA4]] +// CHECK-128-NEXT: ret void +// +// CHECK-256-LABEL: @write_bool1( +// CHECK-256-NEXT: entry: +// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 8 +// CHECK-256-NEXT: store [[X:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds 
[[STRUCT_STRUCT_BOOL1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: store <32 x i8> [[TMP0]], ptr [[Y]], align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: ret void +// +void write_bool1(struct struct_bool1 *s, vbool1_t x) { + s->y[0] = x; +} diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c index 70e1aefe7aaffb..888abe1a7bc3fb 100644 --- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c +++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c @@ -7,6 +7,8 @@ typedef vint32m1_t fixed_int32m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); typedef vfloat64m1_t fixed_float64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/4))); //===----------------------------------------------------------------------===// // Test caller/callee with VLST <-> VLAT @@ -66,6 +68,30 @@ fixed_float64m1_t call_float64_ff(fixed_float64m1_t op1, fixed_float64m1_t op2) return __riscv_vfadd(op1, op2, __riscv_v_fixed_vlen/64); } +// CHECK-LABEL: @call_bool1_ff( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SAVED_VALUE4:%.*]] = alloca , align 8 +// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 8 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv64i1.i64( [[OP1_COERCE:%.*]], [[OP2_COERCE:%.*]], i64 256) +// CHECK-NEXT: store [[TMP0]], ptr [[SAVED_VALUE4]], align 8, !tbaa [[TBAA4:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE4]], align 8, !tbaa [[TBAA8:![0-9]+]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: ret [[TMP2]] +// +fixed_bool1_t call_bool1_ff(fixed_bool1_t op1, fixed_bool1_t op2) { + return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen); +} + 
+// CHECK-LABEL: @call_bool4_ff( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.riscv.vmand.nxv16i1.i64( [[TMP0:%.*]], [[TMP1:%.*]], i64 64) +// CHECK-NEXT: ret [[TMP2]] +// +fixed_bool4_t call_bool4_ff(fixed_bool4_t op1, fixed_bool4_t op2) { + return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 4); +} + //===----------------------------------------------------------------------===// // fixed, scalable //===----------------------------------------------------------------------===// @@ -88,6 +114,30 @@ fixed_float64m1_t call_float64_fs(fixed_float64m1_t op1, vfloat64m1_t op2) { return __riscv_vfadd(op1, op2, __riscv_v_fixed_vlen/64); } +// CHECK-LABEL: @call_bool1_fs( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SAVED_VALUE2:%.*]] = alloca , align 8 +// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 8 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv64i1.i64( [[OP1_COERCE:%.*]], [[OP2:%.*]], i64 256) +// CHECK-NEXT: store [[TMP0]], ptr [[SAVED_VALUE2]], align 8, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE2]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: ret [[TMP2]] +// +fixed_bool1_t call_bool1_fs(fixed_bool1_t op1, vbool1_t op2) { + return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen); +} + +// CHECK-LABEL: @call_bool4_fs( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.riscv.vmand.nxv16i1.i64( [[TMP0:%.*]], [[OP2:%.*]], i64 64) +// CHECK-NEXT: ret [[TMP1]] +// +fixed_bool4_t call_bool4_fs(fixed_bool4_t op1, vbool4_t op2) { + return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 4); +} + //===----------------------------------------------------------------------===// // scalable, scalable //===----------------------------------------------------------------------===// @@ -109,3 +159,27 @@ fixed_int32m1_t call_int32_ss(vint32m1_t op1, 
vint32m1_t op2) { fixed_float64m1_t call_float64_ss(vfloat64m1_t op1, vfloat64m1_t op2) { return __riscv_vfadd(op1, op2, __riscv_v_fixed_vlen/64); } + +// CHECK-LABEL: @call_bool1_ss( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 8 +// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 8 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv64i1.i64( [[OP1:%.*]], [[OP2:%.*]], i64 256) +// CHECK-NEXT: store [[TMP0]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: ret [[TMP2]] +// +fixed_bool1_t call_bool1_ss(vbool1_t op1, vbool1_t op2) { + return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen); +} + +// CHECK-LABEL: @call_bool4_ss( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv16i1.i64( [[OP1:%.*]], [[OP2:%.*]], i64 64) +// CHECK-NEXT: ret [[TMP0]] +// +fixed_bool4_t call_bool4_ss(vbool4_t op1, vbool4_t op2) { + return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 4); +} diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c index 93e9a4eee96eb8..fe278174bf6817 100644 --- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c +++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c @@ -16,6 +16,10 @@ typedef __rvv_uint64m1_t vuint64m1_t; typedef __rvv_float32m1_t vfloat32m1_t; typedef __rvv_float64m1_t vfloat64m1_t; +typedef __rvv_bool1_t vbool1_t; +typedef __rvv_bool4_t vbool4_t; +typedef __rvv_bool32_t vbool32_t; + typedef vint64m1_t fixed_int64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); typedef vfloat64m1_t fixed_float64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); @@ -23,6 +27,10 @@ typedef vint32m1_t fixed_int32m1_t 
__attribute__((riscv_rvv_vector_bits(__riscv_ typedef vfloat64m1_t fixed_float64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); typedef int32_t gnu_int32m1_t __attribute__((vector_size(__riscv_v_fixed_vlen / 8))); +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/4))); +typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/32))); + // CHECK-LABEL: @to_vint32m1_t( // CHECK-NEXT: entry: // CHECK-NEXT: ret [[TYPE_COERCE:%.*]] @@ -55,9 +63,69 @@ fixed_float64m1_t from_vfloat64m1_t(vfloat64m1_t type) { return type; } +// CHECK-LABEL: @from_vbool1_t( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 8 +// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 8 +// CHECK-NEXT: store [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA8:![0-9]+]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: ret [[TMP1]] +// +fixed_bool1_t from_vbool1_t(vbool1_t type) { + return type; +} + +// CHECK-LABEL: @to_vbool1_t( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret [[TYPE_COERCE:%.*]] +// +vbool1_t to_vbool1_t(fixed_bool1_t type) { + return type; +} + +// CHECK-LABEL: @from_vbool4_t( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret [[TYPE:%.*]] +// +fixed_bool4_t from_vbool4_t(vbool4_t type) { + return type; +} + +// CHECK-LABEL: @to_vbool4_t( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret [[TMP0:%.*]] +// +vbool4_t to_vbool4_t(fixed_bool4_t type) { + return type; +} + +// CHECK-LABEL: @from_vbool32_t( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 +// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 +// CHECK-NEXT: store [[TYPE:%.*]], ptr 
[[SAVED_VALUE]], align 1, !tbaa [[TBAA9:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA8]] +// CHECK-NEXT: store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 +// CHECK-NEXT: ret [[TMP1]] +// +fixed_bool32_t from_vbool32_t(vbool32_t type) { + return type; +} + +// CHECK-LABEL: @to_vbool32_t( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret [[TYPE_COERCE:%.*]] +// +vbool32_t to_vbool32_t(fixed_bool32_t type) { + return type; +} + // CHECK-LABEL: @to_vint32m1_t__from_gnu_int32m1_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i32.v8i32( undef, <8 x i32> [[TYPE]], i64 0) // CHECK-NEXT: ret [[CAST_SCALABLE]] // @@ -68,7 +136,7 @@ vint32m1_t to_vint32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) { // CHECK-LABEL: @from_vint32m1_t__to_gnu_int32m1_t( // CHECK-NEXT: entry: // CHECK-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TYPE:%.*]], i64 0) -// CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) { @@ -77,7 +145,7 @@ gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) { // CHECK-LABEL: @to_fixed_int32m1_t__from_gnu_int32m1_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i32.v8i32( undef, <8 x i32> [[TYPE]], i64 
0) // CHECK-NEXT: ret [[CAST_SCALABLE]] // @@ -88,7 +156,7 @@ fixed_int32m1_t to_fixed_int32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) { // CHECK-LABEL: @from_fixed_int32m1_t__to_gnu_int32m1_t( // CHECK-NEXT: entry: // CHECK-NEXT: [[TYPE:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TYPE_COERCE:%.*]], i64 0) -// CHECK-NEXT: store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-NEXT: store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // gnu_int32m1_t from_fixed_int32m1_t__to_gnu_int32m1_t(fixed_int32m1_t type) { diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c index 959a6c9bf96888..ac22bdce0da3e5 100644 --- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c +++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c @@ -27,11 +27,117 @@ typedef __rvv_uint64m2_t vuint64m2_t; typedef __rvv_float32m2_t vfloat32m2_t; typedef __rvv_float64m2_t vfloat64m2_t; +typedef __rvv_bool1_t vbool1_t; +typedef __rvv_bool4_t vbool4_t; +typedef __rvv_bool32_t vbool32_t; + typedef vint32m1_t fixed_int32m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); typedef vint32m2_t fixed_int32m2_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * 2))); +typedef vint16m4_t fixed_int16m4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * 4))); +typedef vint8m8_t fixed_int8m8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * 8))); +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/4))); +typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/32))); fixed_int32m1_t global_vec; fixed_int32m2_t global_vec_m2; +fixed_int8m8_t global_vec_int8m8; +fixed_int16m4_t global_vec_int16m4; +fixed_bool1_t 
global_bool1; +fixed_bool4_t global_bool4; +fixed_bool32_t global_bool32; + +// CHECK-LABEL: @test_bool1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <256 x i8>, align 8 +// CHECK-NEXT: [[M_ADDR:%.*]] = alloca , align 1 +// CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca , align 1 +// CHECK-NEXT: [[MASK:%.*]] = alloca , align 1 +// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <32 x i8>, align 32 +// CHECK-NEXT: store [[M:%.*]], ptr [[M_ADDR]], align 1 +// CHECK-NEXT: store [[VEC:%.*]], ptr [[VEC_ADDR]], align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load , ptr [[M_ADDR]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @global_bool1, align 8 +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[SAVED_VALUE]], align 32 +// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[SAVED_VALUE]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = call @llvm.riscv.vmand.nxv64i1.i64( [[TMP0]], [[TMP2]], i64 256) +// CHECK-NEXT: store [[TMP3]], ptr [[MASK]], align 1 +// CHECK-NEXT: [[TMP4:%.*]] = load , ptr [[MASK]], align 1 +// CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[VEC_ADDR]], align 1 +// CHECK-NEXT: [[TMP6:%.*]] = load <256 x i8>, ptr @global_vec_int8m8, align 8 +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = call @llvm.vector.insert.nxv64i8.v256i8( undef, <256 x i8> [[TMP6]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = call @llvm.riscv.vadd.mask.nxv64i8.nxv64i8.i64( poison, [[TMP5]], [[CAST_SCALABLE]], [[TMP4]], i64 256, i64 3) +// CHECK-NEXT: [[CAST_FIXED:%.*]] = call <256 x i8> @llvm.vector.extract.v256i8.nxv64i8( [[TMP7]], i64 0) +// CHECK-NEXT: store <256 x i8> [[CAST_FIXED]], ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load <256 x i8>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[CAST_SCALABLE1:%.*]] = call @llvm.vector.insert.nxv64i8.v256i8( undef, <256 x i8> [[TMP8]], i64 0) +// CHECK-NEXT: ret [[CAST_SCALABLE1]] +// +fixed_int8m8_t test_bool1(vbool1_t m, vint8m8_t vec) { + vbool1_t mask = __riscv_vmand(m, global_bool1, __riscv_v_fixed_vlen); + return __riscv_vadd(mask, vec, 
global_vec_int8m8, __riscv_v_fixed_vlen); +} + +// CHECK-LABEL: @test_bool4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <64 x i16>, align 8 +// CHECK-NEXT: [[M_ADDR:%.*]] = alloca , align 1 +// CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca , align 2 +// CHECK-NEXT: [[MASK:%.*]] = alloca , align 1 +// CHECK-NEXT: store [[M:%.*]], ptr [[M_ADDR]], align 1 +// CHECK-NEXT: store [[VEC:%.*]], ptr [[VEC_ADDR]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load , ptr [[M_ADDR]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @global_bool4, align 8 +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = call @llvm.vector.insert.nxv2i8.v8i8( undef, <8 x i8> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast [[CAST_SCALABLE]] to +// CHECK-NEXT: [[TMP3:%.*]] = call @llvm.riscv.vmand.nxv16i1.i64( [[TMP0]], [[TMP2]], i64 64) +// CHECK-NEXT: store [[TMP3]], ptr [[MASK]], align 1 +// CHECK-NEXT: [[TMP4:%.*]] = load , ptr [[MASK]], align 1 +// CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[VEC_ADDR]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = load <64 x i16>, ptr @global_vec_int16m4, align 8 +// CHECK-NEXT: [[CAST_SCALABLE1:%.*]] = call @llvm.vector.insert.nxv16i16.v64i16( undef, <64 x i16> [[TMP6]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = call @llvm.riscv.vadd.mask.nxv16i16.nxv16i16.i64( poison, [[TMP5]], [[CAST_SCALABLE1]], [[TMP4]], i64 64, i64 3) +// CHECK-NEXT: [[CAST_FIXED:%.*]] = call <64 x i16> @llvm.vector.extract.v64i16.nxv16i16( [[TMP7]], i64 0) +// CHECK-NEXT: store <64 x i16> [[CAST_FIXED]], ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load <64 x i16>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[CAST_SCALABLE2:%.*]] = call @llvm.vector.insert.nxv16i16.v64i16( undef, <64 x i16> [[TMP8]], i64 0) +// CHECK-NEXT: ret [[CAST_SCALABLE2]] +// +fixed_int16m4_t test_bool4(vbool4_t m, vint16m4_t vec) { + vbool4_t mask = __riscv_vmand(m, global_bool4, __riscv_v_fixed_vlen/4); + return __riscv_vadd(mask, vec, global_vec_int16m4, __riscv_v_fixed_vlen/4); +} + +// CHECK-LABEL: 
@test_bool32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <8 x i32>, align 8 +// CHECK-NEXT: [[M_ADDR:%.*]] = alloca , align 1 +// CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca , align 4 +// CHECK-NEXT: [[MASK:%.*]] = alloca , align 1 +// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1 +// CHECK-NEXT: store [[M:%.*]], ptr [[M_ADDR]], align 1 +// CHECK-NEXT: store [[VEC:%.*]], ptr [[VEC_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load , ptr [[M_ADDR]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr @global_bool32, align 1 +// CHECK-NEXT: store <1 x i8> [[TMP1]], ptr [[SAVED_VALUE]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[SAVED_VALUE]], align 1 +// CHECK-NEXT: [[TMP3:%.*]] = call @llvm.riscv.vmand.nxv2i1.i64( [[TMP0]], [[TMP2]], i64 8) +// CHECK-NEXT: store [[TMP3]], ptr [[MASK]], align 1 +// CHECK-NEXT: [[TMP4:%.*]] = load , ptr [[MASK]], align 1 +// CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[VEC_ADDR]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr @global_vec, align 8 +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = call @llvm.vector.insert.nxv2i32.v8i32( undef, <8 x i32> [[TMP6]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = call @llvm.riscv.vadd.mask.nxv2i32.nxv2i32.i64( poison, [[TMP5]], [[CAST_SCALABLE]], [[TMP4]], i64 8, i64 3) +// CHECK-NEXT: [[CAST_FIXED:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TMP7]], i64 0) +// CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[CAST_SCALABLE1:%.*]] = call @llvm.vector.insert.nxv2i32.v8i32( undef, <8 x i32> [[TMP8]], i64 0) +// CHECK-NEXT: ret [[CAST_SCALABLE1]] +// +fixed_int32m1_t test_bool32(vbool32_t m, vint32m1_t vec) { + vbool32_t mask = __riscv_vmand(m, global_bool32, __riscv_v_fixed_vlen/32); + return __riscv_vadd(mask, vec, global_vec, __riscv_v_fixed_vlen/32); +} // CHECK-LABEL: @test_ptr_to_global( // CHECK-NEXT: entry: @@ -70,6 +176,72 @@ 
fixed_int32m1_t array_arg(fixed_int32m1_t arr[]) { return arr[0]; } +// CHECK-LABEL: @address_of_array_idx_bool1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <32 x i8>, align 8 +// CHECK-NEXT: [[ARR:%.*]] = alloca [3 x <32 x i8>], align 8 +// CHECK-NEXT: [[PARR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <32 x i8>], ptr [[ARR]], i64 0, i64 0 +// CHECK-NEXT: store ptr [[ARRAYIDX]], ptr [[PARR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PARR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 8 +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[RETVAL]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL_COERCE]], ptr align 8 [[RETVAL]], i64 32, i1 false) +// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 8 +// CHECK-NEXT: ret [[TMP2]] +// +fixed_bool1_t address_of_array_idx_bool1() { + fixed_bool1_t arr[3]; + fixed_bool1_t *parr; + parr = &arr[0]; + return *parr; +} + +// CHECK-LABEL: @address_of_array_idx_bool4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[ARR:%.*]] = alloca [3 x <8 x i8>], align 8 +// CHECK-NEXT: [[PARR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[ARR]], i64 0, i64 0 +// CHECK-NEXT: store ptr [[ARRAYIDX]], ptr [[PARR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PARR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[TMP0]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP1]], ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = call @llvm.vector.insert.nxv2i8.v8i8( undef, <8 x i8> [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[CAST_SCALABLE]] to +// CHECK-NEXT: ret [[TMP3]] +// +fixed_bool4_t address_of_array_idx_bool4() { 
+ fixed_bool4_t arr[3]; + fixed_bool4_t *parr; + parr = &arr[0]; + return *parr; +} + +// CHECK-LABEL: @address_of_array_idx_bool32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <1 x i8>, align 1 +// CHECK-NEXT: [[ARR:%.*]] = alloca [3 x <1 x i8>], align 1 +// CHECK-NEXT: [[PARR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i8>], ptr [[ARR]], i64 0, i64 0 +// CHECK-NEXT: store ptr [[ARRAYIDX]], ptr [[PARR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PARR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[TMP0]], align 1 +// CHECK-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL]], align 1 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RETVAL_COERCE]], ptr align 1 [[RETVAL]], i64 1, i1 false) +// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 +// CHECK-NEXT: ret [[TMP2]] +// +fixed_bool32_t address_of_array_idx_bool32() { + fixed_bool32_t arr[3]; + fixed_bool32_t *parr; + parr = &arr[0]; + return *parr; +} + // CHECK-LABEL: @test_cast( // CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca <8 x i32>, align 8 diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c index 8bdcd9af20efca..d7df1a24bbfb00 100644 --- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c +++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c @@ -17,10 +17,25 @@ typedef __rvv_uint64m1_t vuint64m1_t; typedef __rvv_float32m1_t vfloat32m1_t; typedef __rvv_float64m1_t vfloat64m1_t; +typedef __rvv_bool1_t vbool1_t; +typedef __rvv_bool4_t vbool4_t; +typedef __rvv_bool32_t vbool32_t; + typedef vint64m1_t fixed_int64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool4_t fixed_bool4_t 
__attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/4))); +#if __riscv_v_fixed_vlen >= 256 +typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/32))); +#endif fixed_int64m1_t global_i64; +fixed_bool1_t global_bool1; +fixed_bool4_t global_bool4; +#if __riscv_v_fixed_vlen >= 256 +fixed_bool32_t global_bool32; +#endif + //===----------------------------------------------------------------------===// // WRITES //===----------------------------------------------------------------------===// @@ -39,6 +54,52 @@ fixed_int64m1_t global_i64; // void write_global_i64(vint64m1_t v) { global_i64 = v; } +// CHECK-64-LABEL: @write_global_bool1( +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 8 +// CHECK-64-NEXT: store [[V:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]] +// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: store <8 x i8> [[TMP0]], ptr @global_bool1, align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: ret void +// +// CHECK-256-LABEL: @write_global_bool1( +// CHECK-256-NEXT: entry: +// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 8 +// CHECK-256-NEXT: store [[V:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: store <32 x i8> [[TMP0]], ptr @global_bool1, align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: ret void +// +void write_global_bool1(vbool1_t v) { global_bool1 = v; } + +// CHECK-64-LABEL: @write_global_bool4( +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[TMP0:%.*]] = bitcast [[V:%.*]] to +// CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8( [[TMP0]], i64 0) +// CHECK-64-NEXT: store <2 x i8> [[CAST_FIXED]], ptr @global_bool4, align 2, !tbaa [[TBAA4]] +// CHECK-64-NEXT: ret void +// +// CHECK-256-LABEL: @write_global_bool4( +// CHECK-256-NEXT: entry: 
+// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast [[V:%.*]] to +// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv2i8( [[TMP0]], i64 0) +// CHECK-256-NEXT: store <8 x i8> [[CAST_FIXED]], ptr @global_bool4, align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: ret void +// +void write_global_bool4(vbool4_t v) { global_bool4 = v; } + +#if __riscv_v_fixed_vlen >= 256 +// CHECK-256-LABEL: @write_global_bool32( +// CHECK-256-NEXT: entry: +// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 +// CHECK-256-NEXT: store [[V:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA9:![0-9]+]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4]] +// CHECK-256-NEXT: store <1 x i8> [[TMP0]], ptr @global_bool32, align 1, !tbaa [[TBAA4]] +// CHECK-256-NEXT: ret void +// +void write_global_bool32(vbool32_t v) { global_bool32 = v; } +#endif + //===----------------------------------------------------------------------===// // READS //===----------------------------------------------------------------------===// @@ -56,3 +117,49 @@ void write_global_i64(vint64m1_t v) { global_i64 = v; } // CHECK-256-NEXT: ret [[CAST_SCALABLE]] // vint64m1_t read_global_i64() { return global_i64; } + +// CHECK-64-LABEL: @read_global_bool1( +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8 +// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool1, align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: store <8 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]] +// CHECK-64-NEXT: ret [[TMP1]] +// +// CHECK-256-LABEL: @read_global_bool1( +// CHECK-256-NEXT: entry: +// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca <32 x i8>, align 32 +// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr @global_bool1, align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: store <32 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 32, !tbaa 
[[TBAA4]] +// CHECK-256-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 32, !tbaa [[TBAA4]] +// CHECK-256-NEXT: ret [[TMP1]] +// +vbool1_t read_global_bool1() { return global_bool1; } + +// CHECK-64-LABEL: @read_global_bool4( +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr @global_bool4, align 2, !tbaa [[TBAA4]] +// CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v2i8( undef, <2 x i8> [[TMP0]], i64 0) +// CHECK-64-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to +// CHECK-64-NEXT: ret [[TMP1]] +// +// CHECK-256-LABEL: @read_global_bool4( +// CHECK-256-NEXT: entry: +// CHECK-256-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool4, align 8, !tbaa [[TBAA4]] +// CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v8i8( undef, <8 x i8> [[TMP0]], i64 0) +// CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to +// CHECK-256-NEXT: ret [[TMP1]] +// +vbool4_t read_global_bool4() { return global_bool4; } + +#if __riscv_v_fixed_vlen >= 256 +// CHECK-256-LABEL: @read_global_bool32( +// CHECK-256-NEXT: entry: +// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1 +// CHECK-256-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr @global_bool32, align 1, !tbaa [[TBAA4]] +// CHECK-256-NEXT: store <1 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4]] +// CHECK-256-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4]] +// CHECK-256-NEXT: ret [[TMP1]] +// +vbool32_t read_global_bool32() { return global_bool32; } +#endif diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-types.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-types.c index 85a320ba50d243..027f7ab24aa120 100644 --- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-types.c +++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-types.c @@ -8,6 +8,14 @@ #include +typedef __rvv_bool64_t vbool64_t; +typedef __rvv_bool32_t vbool32_t; +typedef __rvv_bool16_t vbool16_t; +typedef __rvv_bool8_t 
vbool8_t; +typedef __rvv_bool4_t vbool4_t; +typedef __rvv_bool2_t vbool2_t; +typedef __rvv_bool1_t vbool1_t; + typedef __rvv_int8mf8_t vint8mf8_t; typedef __rvv_uint8mf8_t vuint8mf8_t; @@ -141,6 +149,20 @@ typedef vuint64m8_t fixed_uint64m8_t __attribute__((riscv_rvv_vector_bits(__risc typedef vfloat32m8_t fixed_float32m8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * 8))); typedef vfloat64m8_t fixed_float64m8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * 8))); +#if __riscv_v_fixed_vlen / 64 >= 8 +typedef vbool64_t fixed_bool64_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 64))); +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 32))); +#endif +#if __riscv_v_fixed_vlen / 16 >= 8 +typedef vbool16_t fixed_bool16_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 16))); +#endif +typedef vbool8_t fixed_bool8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 8))); +typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 4))); +typedef vbool2_t fixed_bool2_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 2))); +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); + //===----------------------------------------------------------------------===// // Structs and unions //===----------------------------------------------------------------------===// @@ -198,6 +220,20 @@ DEFINE_STRUCT(uint64m8) DEFINE_STRUCT(float32m8) DEFINE_STRUCT(float64m8) +DEFINE_STRUCT(bool1) +DEFINE_STRUCT(bool2) +DEFINE_STRUCT(bool4) +DEFINE_STRUCT(bool8) +#if __riscv_v_fixed_vlen / 16 >= 8 +DEFINE_STRUCT(bool16) +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +DEFINE_STRUCT(bool32) +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +DEFINE_STRUCT(bool64) +#endif + DEFINE_UNION(int8m1) DEFINE_UNION(int16m1) DEFINE_UNION(int32m1) @@ -242,6 +278,20 @@ 
DEFINE_UNION(uint64m8) DEFINE_UNION(float32m8) DEFINE_UNION(float64m8) +DEFINE_UNION(bool1) +DEFINE_UNION(bool2) +DEFINE_UNION(bool4) +DEFINE_UNION(bool8) +#if __riscv_v_fixed_vlen / 16 >= 8 +DEFINE_UNION(bool16) +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +DEFINE_UNION(bool32) +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +DEFINE_UNION(bool64) +#endif + //===----------------------------------------------------------------------===// // Global variables //===----------------------------------------------------------------------===// @@ -297,6 +347,20 @@ fixed_uint64m8_t global_u64m8; fixed_float32m8_t global_f32m8; fixed_float64m8_t global_f64m8; +fixed_bool1_t global_bool1; +fixed_bool2_t global_bool2; +fixed_bool4_t global_bool4; +fixed_bool8_t global_bool8; +#if __riscv_v_fixed_vlen / 16 >= 8 +fixed_bool16_t global_bool16; +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +fixed_bool32_t global_bool32; +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +fixed_bool64_t global_bool64; +#endif + //===----------------------------------------------------------------------===// // Global arrays //===----------------------------------------------------------------------===// @@ -352,6 +416,20 @@ fixed_uint64m8_t global_arr_u64m8[3]; fixed_float32m8_t global_arr_f32m8[3]; fixed_float64m8_t global_arr_f64m8[3]; +fixed_bool1_t global_arr_bool1[3]; +fixed_bool2_t global_arr_bool2[3]; +fixed_bool4_t global_arr_bool4[3]; +fixed_bool8_t global_arr_bool8[3]; +#if __riscv_v_fixed_vlen / 16 >= 8 +fixed_bool16_t global_arr_bool16[3]; +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +fixed_bool32_t global_arr_bool32[3]; +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +fixed_bool64_t global_arr_bool64[3]; +#endif + //===----------------------------------------------------------------------===// // Locals //===----------------------------------------------------------------------===// @@ -401,6 +479,20 @@ void f() { fixed_float32m8_t local_f32m8; fixed_float64m8_t local_f64m8; + fixed_bool1_t local_bool1; 
+ fixed_bool2_t local_bool2; + fixed_bool4_t local_bool4; + fixed_bool8_t local_bool8; +#if __riscv_v_fixed_vlen / 16 >= 8 + fixed_bool16_t local_bool16; +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 + fixed_bool32_t local_bool32; +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 + fixed_bool64_t local_bool64; +#endif + // Arrays fixed_int8m1_t local_arr_i8[3]; fixed_int16m1_t local_arr_i16[3]; @@ -461,6 +553,20 @@ void f() { fixed_int8mf8_t local_arr_i8mf8[3]; fixed_uint8mf8_t local_arr_u8mf8[3]; + + fixed_bool1_t local_arr_bool1[3]; + fixed_bool2_t local_arr_bool2[3]; + fixed_bool4_t local_arr_bool4[3]; + fixed_bool8_t local_arr_bool8[3]; +#if __riscv_v_fixed_vlen / 16 >= 8 + fixed_bool16_t local_arr_bool16[3]; +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 + fixed_bool32_t local_arr_bool32[3]; +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 + fixed_bool64_t local_arr_bool64[3]; +#endif } //===----------------------------------------------------------------------===// @@ -506,6 +612,10 @@ void f() { // CHECK-64-NEXT: %struct.struct_uint64m8 = type { <8 x i64> } // CHECK-64-NEXT: %struct.struct_float32m8 = type { <16 x float> } // CHECK-64-NEXT: %struct.struct_float64m8 = type { <8 x double> } +// CHECK-64-NEXT: %struct.struct_bool1 = type { <8 x i8> } +// CHECK-64-NEXT: %struct.struct_bool2 = type { <4 x i8> } +// CHECK-64-NEXT: %struct.struct_bool4 = type { <2 x i8> } +// CHECK-64-NEXT: %struct.struct_bool8 = type { <1 x i8> } // CHECK-128: %struct.struct_int8m1 = type { <16 x i8> } // CHECK-128-NEXT: %struct.struct_int16m1 = type { <8 x i16> } @@ -547,6 +657,11 @@ void f() { // CHECK-128-NEXT: %struct.struct_uint64m8 = type { <16 x i64> } // CHECK-128-NEXT: %struct.struct_float32m8 = type { <32 x float> } // CHECK-128-NEXT: %struct.struct_float64m8 = type { <16 x double> } +// CHECK-128-NEXT: %struct.struct_bool1 = type { <16 x i8> } +// CHECK-128-NEXT: %struct.struct_bool2 = type { <8 x i8> } +// CHECK-128-NEXT: %struct.struct_bool4 = type { <4 x i8> } +// CHECK-128-NEXT: 
%struct.struct_bool8 = type { <2 x i8> } +// CHECK-128-NEXT: %struct.struct_bool16 = type { <1 x i8> } // CHECK-256: %struct.struct_int8m1 = type { <32 x i8> } // CHECK-256-NEXT: %struct.struct_int16m1 = type { <16 x i16> } @@ -587,6 +702,13 @@ void f() { // CHECK-256-NEXT: %struct.struct_uint32m8 = type { <64 x i32> } // CHECK-256-NEXT: %struct.struct_uint64m8 = type { <32 x i64> } // CHECK-256-NEXT: %struct.struct_float32m8 = type { <64 x float> } +// CHECK-256-NEXT: %struct.struct_float64m8 = type { <32 x double> } +// CHECK-256-NEXT: %struct.struct_bool1 = type { <32 x i8> } +// CHECK-256-NEXT: %struct.struct_bool2 = type { <16 x i8> } +// CHECK-256-NEXT: %struct.struct_bool4 = type { <8 x i8> } +// CHECK-256-NEXT: %struct.struct_bool8 = type { <4 x i8> } +// CHECK-256-NEXT: %struct.struct_bool16 = type { <2 x i8> } +// CHECK-256-NEXT: %struct.struct_bool32 = type { <1 x i8> } // CHECK-512: %struct.struct_int8m1 = type { <64 x i8> } // CHECK-512-NEXT: %struct.struct_int16m1 = type { <32 x i16> } @@ -627,6 +749,14 @@ void f() { // CHECK-512-NEXT: %struct.struct_uint32m8 = type { <128 x i32> } // CHECK-512-NEXT: %struct.struct_uint64m8 = type { <64 x i64> } // CHECK-512-NEXT: %struct.struct_float32m8 = type { <128 x float> } +// CHECK-512-NEXT: %struct.struct_float64m8 = type { <64 x double> } +// CHECK-512-NEXT: %struct.struct_bool1 = type { <64 x i8> } +// CHECK-512-NEXT: %struct.struct_bool2 = type { <32 x i8> } +// CHECK-512-NEXT: %struct.struct_bool4 = type { <16 x i8> } +// CHECK-512-NEXT: %struct.struct_bool8 = type { <8 x i8> } +// CHECK-512-NEXT: %struct.struct_bool16 = type { <4 x i8> } +// CHECK-512-NEXT: %struct.struct_bool32 = type { <2 x i8> } +// CHECK-512-NEXT: %struct.struct_bool64 = type { <1 x i8> } // CHECK-1024: %struct.struct_int8m1 = type { <128 x i8> } // CHECK-1024-NEXT: %struct.struct_int16m1 = type { <64 x i16> } @@ -667,6 +797,14 @@ void f() { // CHECK-1024-NEXT: %struct.struct_uint32m8 = type { <256 x i32> } // CHECK-1024-NEXT: 
%struct.struct_uint64m8 = type { <128 x i64> } // CHECK-1024-NEXT: %struct.struct_float32m8 = type { <256 x float> } +// CHECK-1024-NEXT: %struct.struct_float64m8 = type { <128 x double> } +// CHECK-1024-NEXT: %struct.struct_bool1 = type { <128 x i8> } +// CHECK-1024-NEXT: %struct.struct_bool2 = type { <64 x i8> } +// CHECK-1024-NEXT: %struct.struct_bool4 = type { <32 x i8> } +// CHECK-1024-NEXT: %struct.struct_bool8 = type { <16 x i8> } +// CHECK-1024-NEXT: %struct.struct_bool16 = type { <8 x i8> } +// CHECK-1024-NEXT: %struct.struct_bool32 = type { <4 x i8> } +// CHECK-1024-NEXT: %struct.struct_bool64 = type { <2 x i8> } // CHECK-64: %union.union_int8m1 = type { <8 x i8> } // CHECK-64-NEXT: %union.union_int16m1 = type { <4 x i16> } @@ -708,6 +846,10 @@ void f() { // CHECK-64-NEXT: %union.union_uint64m8 = type { <8 x i64> } // CHECK-64-NEXT: %union.union_float32m8 = type { <16 x float> } // CHECK-64-NEXT: %union.union_float64m8 = type { <8 x double> } +// CHECK-64-NEXT: %union.union_bool1 = type { <8 x i8> } +// CHECK-64-NEXT: %union.union_bool2 = type { <4 x i8> } +// CHECK-64-NEXT: %union.union_bool4 = type { <2 x i8> } +// CHECK-64-NEXT: %union.union_bool8 = type { <1 x i8> } // CHECK-128: %union.union_int8m1 = type { <16 x i8> } // CHECK-128-NEXT: %union.union_int16m1 = type { <8 x i16> } @@ -749,6 +891,11 @@ void f() { // CHECK-128-NEXT: %union.union_uint64m8 = type { <16 x i64> } // CHECK-128-NEXT: %union.union_float32m8 = type { <32 x float> } // CHECK-128-NEXT: %union.union_float64m8 = type { <16 x double> } +// CHECK-128-NEXT: %union.union_bool1 = type { <16 x i8> } +// CHECK-128-NEXT: %union.union_bool2 = type { <8 x i8> } +// CHECK-128-NEXT: %union.union_bool4 = type { <4 x i8> } +// CHECK-128-NEXT: %union.union_bool8 = type { <2 x i8> } +// CHECK-128-NEXT: %union.union_bool16 = type { <1 x i8> } // CHECK-256: %union.union_int8m1 = type { <32 x i8> } // CHECK-256-NEXT: %union.union_int16m1 = type { <16 x i16> } @@ -790,6 +937,12 @@ void f() { // 
CHECK-256-NEXT: %union.union_uint64m8 = type { <32 x i64> } // CHECK-256-NEXT: %union.union_float32m8 = type { <64 x float> } // CHECK-256-NEXT: %union.union_float64m8 = type { <32 x double> } +// CHECK-256-NEXT: %union.union_bool1 = type { <32 x i8> } +// CHECK-256-NEXT: %union.union_bool2 = type { <16 x i8> } +// CHECK-256-NEXT: %union.union_bool4 = type { <8 x i8> } +// CHECK-256-NEXT: %union.union_bool8 = type { <4 x i8> } +// CHECK-256-NEXT: %union.union_bool16 = type { <2 x i8> } +// CHECK-256-NEXT: %union.union_bool32 = type { <1 x i8> } // CHECK-512: %union.union_int8m1 = type { <64 x i8> } // CHECK-512-NEXT: %union.union_int16m1 = type { <32 x i16> } @@ -831,6 +984,13 @@ void f() { // CHECK-512-NEXT: %union.union_uint64m8 = type { <64 x i64> } // CHECK-512-NEXT: %union.union_float32m8 = type { <128 x float> } // CHECK-512-NEXT: %union.union_float64m8 = type { <64 x double> } +// CHECK-512-NEXT: %union.union_bool1 = type { <64 x i8> } +// CHECK-512-NEXT: %union.union_bool2 = type { <32 x i8> } +// CHECK-512-NEXT: %union.union_bool4 = type { <16 x i8> } +// CHECK-512-NEXT: %union.union_bool8 = type { <8 x i8> } +// CHECK-512-NEXT: %union.union_bool16 = type { <4 x i8> } +// CHECK-512-NEXT: %union.union_bool32 = type { <2 x i8> } +// CHECK-512-NEXT: %union.union_bool64 = type { <1 x i8> } // CHECK-1024: %union.union_int8m1 = type { <128 x i8> } // CHECK-1024-NEXT: %union.union_int16m1 = type { <64 x i16> } @@ -872,6 +1032,13 @@ void f() { // CHECK-1024-NEXT: %union.union_uint64m8 = type { <128 x i64> } // CHECK-1024-NEXT: %union.union_float32m8 = type { <256 x float> } // CHECK-1024-NEXT: %union.union_float64m8 = type { <128 x double> } +// CHECK-1024-NEXT: %union.union_bool1 = type { <128 x i8> } +// CHECK-1024-NEXT: %union.union_bool2 = type { <64 x i8> } +// CHECK-1024-NEXT: %union.union_bool4 = type { <32 x i8> } +// CHECK-1024-NEXT: %union.union_bool8 = type { <16 x i8> } +// CHECK-1024-NEXT: %union.union_bool16 = type { <8 x i8> } +// CHECK-1024-NEXT: 
%union.union_bool32 = type { <4 x i8> } +// CHECK-1024-NEXT: %union.union_bool64 = type { <2 x i8> } //===----------------------------------------------------------------------===// // Global variables @@ -916,6 +1083,10 @@ void f() { // CHECK-64-NEXT: @global_u64m8 ={{.*}} global <8 x i64> zeroinitializer, align 8 // CHECK-64-NEXT: @global_f32m8 ={{.*}} global <16 x float> zeroinitializer, align 8 // CHECK-64-NEXT: @global_f64m8 ={{.*}} global <8 x double> zeroinitializer, align 8 +// CHECK-64-NEXT: @global_bool1 ={{.*}} global <8 x i8> zeroinitializer, align 8 +// CHECK-64-NEXT: @global_bool2 ={{.*}} global <4 x i8> zeroinitializer, align 4 +// CHECK-64-NEXT: @global_bool4 ={{.*}} global <2 x i8> zeroinitializer, align 2 +// CHECK-64-NEXT: @global_bool8 ={{.*}} global <1 x i8> zeroinitializer, align 1 // CHECK-128: @global_i8 ={{.*}} global <16 x i8> zeroinitializer, align 8 // CHECK-128-NEXT: @global_i16 ={{.*}} global <8 x i16> zeroinitializer, align 8 @@ -957,6 +1128,11 @@ void f() { // CHECK-128-NEXT: @global_u64m8 ={{.*}} global <16 x i64> zeroinitializer, align 8 // CHECK-128-NEXT: @global_f32m8 ={{.*}} global <32 x float> zeroinitializer, align 8 // CHECK-128-NEXT: @global_f64m8 ={{.*}} global <16 x double> zeroinitializer, align 8 +// CHECK-128-NEXT: @global_bool1 ={{.*}} global <16 x i8> zeroinitializer, align 8 +// CHECK-128-NEXT: @global_bool2 ={{.*}} global <8 x i8> zeroinitializer, align 8 +// CHECK-128-NEXT: @global_bool4 ={{.*}} global <4 x i8> zeroinitializer, align 4 +// CHECK-128-NEXT: @global_bool8 ={{.*}} global <2 x i8> zeroinitializer, align 2 +// CHECK-128-NEXT: @global_bool16 ={{.*}} global <1 x i8> zeroinitializer, align 1 // CHECK-256: @global_i8 ={{.*}} global <32 x i8> zeroinitializer, align 8 // CHECK-256-NEXT: @global_i16 ={{.*}} global <16 x i16> zeroinitializer, align 8 @@ -998,6 +1174,12 @@ void f() { // CHECK-256-NEXT: @global_u64m8 ={{.*}} global <32 x i64> zeroinitializer, align 8 // CHECK-256-NEXT: @global_f32m8 ={{.*}} global 
<64 x float> zeroinitializer, align 8 // CHECK-256-NEXT: @global_f64m8 ={{.*}} global <32 x double> zeroinitializer, align 8 +// CHECK-256-NEXT: @global_bool1 ={{.*}} global <32 x i8> zeroinitializer, align 8 +// CHECK-256-NEXT: @global_bool2 ={{.*}} global <16 x i8> zeroinitializer, align 8 +// CHECK-256-NEXT: @global_bool4 ={{.*}} global <8 x i8> zeroinitializer, align 8 +// CHECK-256-NEXT: @global_bool8 ={{.*}} global <4 x i8> zeroinitializer, align 4 +// CHECK-256-NEXT: @global_bool16 ={{.*}} global <2 x i8> zeroinitializer, align 2 +// CHECK-256-NEXT: @global_bool32 ={{.*}} global <1 x i8> zeroinitializer, align 1 // CHECK-512: @global_i8 ={{.*}} global <64 x i8> zeroinitializer, align 8 // CHECK-512-NEXT: @global_i16 ={{.*}} global <32 x i16> zeroinitializer, align 8 @@ -1039,6 +1221,13 @@ void f() { // CHECK-512-NEXT: @global_u64m8 ={{.*}} global <64 x i64> zeroinitializer, align 8 // CHECK-512-NEXT: @global_f32m8 ={{.*}} global <128 x float> zeroinitializer, align 8 // CHECK-512-NEXT: @global_f64m8 ={{.*}} global <64 x double> zeroinitializer, align 8 +// CHECK-512-NEXT: @global_bool1 ={{.*}} global <64 x i8> zeroinitializer, align 8 +// CHECK-512-NEXT: @global_bool2 ={{.*}} global <32 x i8> zeroinitializer, align 8 +// CHECK-512-NEXT: @global_bool4 ={{.*}} global <16 x i8> zeroinitializer, align 8 +// CHECK-512-NEXT: @global_bool8 ={{.*}} global <8 x i8> zeroinitializer, align 8 +// CHECK-512-NEXT: @global_bool16 ={{.*}} global <4 x i8> zeroinitializer, align 4 +// CHECK-512-NEXT: @global_bool32 ={{.*}} global <2 x i8> zeroinitializer, align 2 +// CHECK-512-NEXT: @global_bool64 ={{.*}} global <1 x i8> zeroinitializer, align 1 // CHECK-1024: @global_i8 ={{.*}} global <128 x i8> zeroinitializer, align 8 // CHECK-1024-NEXT: @global_i16 ={{.*}} global <64 x i16> zeroinitializer, align 8 @@ -1080,6 +1269,13 @@ void f() { // CHECK-1024-NEXT: @global_u64m8 ={{.*}} global <128 x i64> zeroinitializer, align 8 // CHECK-1024-NEXT: @global_f32m8 ={{.*}} global <256 x 
float> zeroinitializer, align 8 // CHECK-1024-NEXT: @global_f64m8 ={{.*}} global <128 x double> zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_bool1 ={{.*}} global <128 x i8> zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_bool2 ={{.*}} global <64 x i8> zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_bool4 ={{.*}} global <32 x i8> zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_bool8 ={{.*}} global <16 x i8> zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_bool16 ={{.*}} global <8 x i8> zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_bool32 ={{.*}} global <4 x i8> zeroinitializer, align 4 +// CHECK-1024-NEXT: @global_bool64 ={{.*}} global <2 x i8> zeroinitializer, align 2 //===----------------------------------------------------------------------===// // Global arrays @@ -1124,6 +1320,10 @@ void f() { // CHECK-64-NEXT: @global_arr_u64m8 ={{.*}} global [3 x <8 x i64>] zeroinitializer, align 8 // CHECK-64-NEXT: @global_arr_f32m8 ={{.*}} global [3 x <16 x float>] zeroinitializer, align 8 // CHECK-64-NEXT: @global_arr_f64m8 ={{.*}} global [3 x <8 x double>] zeroinitializer, align 8 +// CHECK-64-NEXT: @global_arr_bool1 ={{.*}} global [3 x <8 x i8>] zeroinitializer, align 8 +// CHECK-64-NEXT: @global_arr_bool2 ={{.*}} global [3 x <4 x i8>] zeroinitializer, align 4 +// CHECK-64-NEXT: @global_arr_bool4 ={{.*}} global [3 x <2 x i8>] zeroinitializer, align 2 +// CHECK-64-NEXT: @global_arr_bool8 ={{.*}} global [3 x <1 x i8>] zeroinitializer, align 1 // CHECK-128: @global_arr_i8 ={{.*}} global [3 x <16 x i8>] zeroinitializer, align 8 // CHECK-128-NEXT: @global_arr_i16 ={{.*}} global [3 x <8 x i16>] zeroinitializer, align 8 @@ -1165,6 +1365,11 @@ void f() { // CHECK-128-NEXT: @global_arr_u64m8 ={{.*}} global [3 x <16 x i64>] zeroinitializer, align 8 // CHECK-128-NEXT: @global_arr_f32m8 ={{.*}} global [3 x <32 x float>] zeroinitializer, align 8 // CHECK-128-NEXT: @global_arr_f64m8 ={{.*}} global [3 x <16 x double>] zeroinitializer, align 8 +// 
CHECK-128-NEXT: @global_arr_bool1 ={{.*}} global [3 x <16 x i8>] zeroinitializer, align 8 +// CHECK-128-NEXT: @global_arr_bool2 ={{.*}} global [3 x <8 x i8>] zeroinitializer, align 8 +// CHECK-128-NEXT: @global_arr_bool4 ={{.*}} global [3 x <4 x i8>] zeroinitializer, align 4 +// CHECK-128-NEXT: @global_arr_bool8 ={{.*}} global [3 x <2 x i8>] zeroinitializer, align 2 +// CHECK-128-NEXT: @global_arr_bool16 ={{.*}} global [3 x <1 x i8>] zeroinitializer, align 1 // CHECK-256: @global_arr_i8 ={{.*}} global [3 x <32 x i8>] zeroinitializer, align 8 // CHECK-256-NEXT: @global_arr_i16 ={{.*}} global [3 x <16 x i16>] zeroinitializer, align 8 @@ -1206,6 +1411,13 @@ void f() { // CHECK-256-NEXT: @global_arr_u64m8 ={{.*}} global [3 x <32 x i64>] zeroinitializer, align 8 // CHECK-256-NEXT: @global_arr_f32m8 ={{.*}} global [3 x <64 x float>] zeroinitializer, align 8 // CHECK-256-NEXT: @global_arr_f64m8 ={{.*}} global [3 x <32 x double>] zeroinitializer, align 8 +// CHECK-256-NEXT: @global_arr_bool1 ={{.*}} global [3 x <32 x i8>] zeroinitializer, align 8 +// CHECK-256-NEXT: @global_arr_bool2 ={{.*}} global [3 x <16 x i8>] zeroinitializer, align 8 +// CHECK-256-NEXT: @global_arr_bool4 ={{.*}} global [3 x <8 x i8>] zeroinitializer, align 8 +// CHECK-256-NEXT: @global_arr_bool8 ={{.*}} global [3 x <4 x i8>] zeroinitializer, align 4 +// CHECK-256-NEXT: @global_arr_bool16 ={{.*}} global [3 x <2 x i8>] zeroinitializer, align 2 +// CHECK-256-NEXT: @global_arr_bool32 ={{.*}} global [3 x <1 x i8>] zeroinitializer, align 1 + // CHECK-512: @global_arr_i8 ={{.*}} global [3 x <64 x i8>] zeroinitializer, align 8 // CHECK-512-NEXT: @global_arr_i16 ={{.*}} global [3 x <32 x i16>] zeroinitializer, align 8 // CHECK-512-NEXT: @global_arr_i32 ={{.*}} global [3 x <16 x i32>] zeroinitializer, align 8 @@ -1246,6 +1458,13 @@ void f() { // CHECK-512-NEXT: @global_arr_u64m8 ={{.*}} global [3 x <64 x i64>] zeroinitializer, align 8 // CHECK-512-NEXT: @global_arr_f32m8 ={{.*}} global [3 x <128 x float>] 
zeroinitializer, align 8 // CHECK-512-NEXT: @global_arr_f64m8 ={{.*}} global [3 x <64 x double>] zeroinitializer, align 8 +// CHECK-512-NEXT: @global_arr_bool1 ={{.*}} global [3 x <64 x i8>] zeroinitializer, align 8 +// CHECK-512-NEXT: @global_arr_bool2 ={{.*}} global [3 x <32 x i8>] zeroinitializer, align 8 +// CHECK-512-NEXT: @global_arr_bool4 ={{.*}} global [3 x <16 x i8>] zeroinitializer, align 8 +// CHECK-512-NEXT: @global_arr_bool8 ={{.*}} global [3 x <8 x i8>] zeroinitializer, align 8 +// CHECK-512-NEXT: @global_arr_bool16 ={{.*}} global [3 x <4 x i8>] zeroinitializer, align 4 +// CHECK-512-NEXT: @global_arr_bool32 ={{.*}} global [3 x <2 x i8>] zeroinitializer, align 2 +// CHECK-512-NEXT: @global_arr_bool64 ={{.*}} global [3 x <1 x i8>] zeroinitializer, align 1 // CHECK-1024: @global_arr_i8 ={{.*}} global [3 x <128 x i8>] zeroinitializer, align 8 // CHECK-1024-NEXT: @global_arr_i16 ={{.*}} global [3 x <64 x i16>] zeroinitializer, align 8 @@ -1287,6 +1506,13 @@ void f() { // CHECK-1024-NEXT: @global_arr_u64m8 ={{.*}} global [3 x <128 x i64>] zeroinitializer, align 8 // CHECK-1024-NEXT: @global_arr_f32m8 ={{.*}} global [3 x <256 x float>] zeroinitializer, align 8 // CHECK-1024-NEXT: @global_arr_f64m8 ={{.*}} global [3 x <128 x double>] zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_arr_bool1 ={{.*}} global [3 x <128 x i8>] zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_arr_bool2 ={{.*}} global [3 x <64 x i8>] zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_arr_bool4 ={{.*}} global [3 x <32 x i8>] zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_arr_bool8 ={{.*}} global [3 x <16 x i8>] zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_arr_bool16 ={{.*}} global [3 x <8 x i8>] zeroinitializer, align 8 +// CHECK-1024-NEXT: @global_arr_bool32 ={{.*}} global [3 x <4 x i8>] zeroinitializer, align 4 +// CHECK-1024-NEXT: @global_arr_bool64 ={{.*}} global [3 x <2 x i8>] zeroinitializer, align 2 
//===----------------------------------------------------------------------===// // Local variables @@ -1331,6 +1557,10 @@ void f() { // CHECK-64-NEXT: %local_u64m8 = alloca <8 x i64>, align 8 // CHECK-64-NEXT: %local_f32m8 = alloca <16 x float>, align 8 // CHECK-64-NEXT: %local_f64m8 = alloca <8 x double>, align 8 +// CHECK-64-NEXT: %local_bool1 = alloca <8 x i8>, align 8 +// CHECK-64-NEXT: %local_bool2 = alloca <4 x i8>, align 4 +// CHECK-64-NEXT: %local_bool4 = alloca <2 x i8>, align 2 +// CHECK-64-NEXT: %local_bool8 = alloca <1 x i8>, align 1 // CHECK-128: %local_i8 = alloca <16 x i8>, align 8 // CHECK-128-NEXT: %local_i16 = alloca <8 x i16>, align 8 @@ -1372,6 +1602,11 @@ void f() { // CHECK-128-NEXT: %local_u64m8 = alloca <16 x i64>, align 8 // CHECK-128-NEXT: %local_f32m8 = alloca <32 x float>, align 8 // CHECK-128-NEXT: %local_f64m8 = alloca <16 x double>, align 8 +// CHECK-128-NEXT: %local_bool1 = alloca <16 x i8>, align 8 +// CHECK-128-NEXT: %local_bool2 = alloca <8 x i8>, align 8 +// CHECK-128-NEXT: %local_bool4 = alloca <4 x i8>, align 4 +// CHECK-128-NEXT: %local_bool8 = alloca <2 x i8>, align 2 +// CHECK-128-NEXT: %local_bool16 = alloca <1 x i8>, align 1 // CHECK-256: %local_i8 = alloca <32 x i8>, align 8 // CHECK-256-NEXT: %local_i16 = alloca <16 x i16>, align 8 @@ -1413,6 +1648,12 @@ void f() { // CHECK-256-NEXT: %local_u64m8 = alloca <32 x i64>, align 8 // CHECK-256-NEXT: %local_f32m8 = alloca <64 x float>, align 8 // CHECK-256-NEXT: %local_f64m8 = alloca <32 x double>, align 8 +// CHECK-256-NEXT: %local_bool1 = alloca <32 x i8>, align 8 +// CHECK-256-NEXT: %local_bool2 = alloca <16 x i8>, align 8 +// CHECK-256-NEXT: %local_bool4 = alloca <8 x i8>, align 8 +// CHECK-256-NEXT: %local_bool8 = alloca <4 x i8>, align 4 +// CHECK-256-NEXT: %local_bool16 = alloca <2 x i8>, align 2 +// CHECK-256-NEXT: %local_bool32 = alloca <1 x i8>, align 1 // CHECK-512: %local_i8 = alloca <64 x i8>, align 8 // CHECK-512-NEXT: %local_i16 = alloca <32 x i16>, align 8 @@ 
-1454,6 +1695,13 @@ void f() { // CHECK-512-NEXT: %local_u64m8 = alloca <64 x i64>, align 8 // CHECK-512-NEXT: %local_f32m8 = alloca <128 x float>, align 8 // CHECK-512-NEXT: %local_f64m8 = alloca <64 x double>, align 8 +// CHECK-512-NEXT: %local_bool1 = alloca <64 x i8>, align 8 +// CHECK-512-NEXT: %local_bool2 = alloca <32 x i8>, align 8 +// CHECK-512-NEXT: %local_bool4 = alloca <16 x i8>, align 8 +// CHECK-512-NEXT: %local_bool8 = alloca <8 x i8>, align 8 +// CHECK-512-NEXT: %local_bool16 = alloca <4 x i8>, align 4 +// CHECK-512-NEXT: %local_bool32 = alloca <2 x i8>, align 2 +// CHECK-512-NEXT: %local_bool64 = alloca <1 x i8>, align 1 // CHECK-1024: %local_i8 = alloca <128 x i8>, align 8 // CHECK-1024-NEXT: %local_i16 = alloca <64 x i16>, align 8 @@ -1495,6 +1743,13 @@ void f() { // CHECK-1024-NEXT: %local_u64m8 = alloca <128 x i64>, align 8 // CHECK-1024-NEXT: %local_f32m8 = alloca <256 x float>, align 8 // CHECK-1024-NEXT: %local_f64m8 = alloca <128 x double>, align 8 +// CHECK-1024-NEXT: %local_bool1 = alloca <128 x i8>, align 8 +// CHECK-1024-NEXT: %local_bool2 = alloca <64 x i8>, align 8 +// CHECK-1024-NEXT: %local_bool4 = alloca <32 x i8>, align 8 +// CHECK-1024-NEXT: %local_bool8 = alloca <16 x i8>, align 8 +// CHECK-1024-NEXT: %local_bool16 = alloca <8 x i8>, align 8 +// CHECK-1024-NEXT: %local_bool32 = alloca <4 x i8>, align 4 +// CHECK-1024-NEXT: %local_bool64 = alloca <2 x i8>, align 2 //===----------------------------------------------------------------------===// // Local arrays @@ -1552,6 +1807,10 @@ void f() { // CHECK-64-NEXT: %local_arr_u16mf4 = alloca [3 x <1 x i16>], align 2 // CHECK-64-NEXT: %local_arr_i8mf8 = alloca [3 x <1 x i8>], align 1 // CHECK-64-NEXT: %local_arr_u8mf8 = alloca [3 x <1 x i8>], align 1 +// CHECK-64-NEXT: %local_arr_bool1 = alloca [3 x <8 x i8>], align 8 +// CHECK-64-NEXT: %local_arr_bool2 = alloca [3 x <4 x i8>], align 4 +// CHECK-64-NEXT: %local_arr_bool4 = alloca [3 x <2 x i8>], align 2 +// CHECK-64-NEXT: 
%local_arr_bool8 = alloca [3 x <1 x i8>], align 1 // CHECK-128: %local_arr_i8 = alloca [3 x <16 x i8>], align 8 // CHECK-128-NEXT: %local_arr_i16 = alloca [3 x <8 x i16>], align 8 @@ -1606,6 +1865,11 @@ void f() { // CHECK-128-NEXT: %local_arr_u16mf4 = alloca [3 x <2 x i16>], align 4 // CHECK-128-NEXT: %local_arr_i8mf8 = alloca [3 x <2 x i8>], align 2 // CHECK-128-NEXT: %local_arr_u8mf8 = alloca [3 x <2 x i8>], align 2 +// CHECK-128-NEXT: %local_arr_bool1 = alloca [3 x <16 x i8>], align 8 +// CHECK-128-NEXT: %local_arr_bool2 = alloca [3 x <8 x i8>], align 8 +// CHECK-128-NEXT: %local_arr_bool4 = alloca [3 x <4 x i8>], align 4 +// CHECK-128-NEXT: %local_arr_bool8 = alloca [3 x <2 x i8>], align 2 +// CHECK-128-NEXT: %local_arr_bool16 = alloca [3 x <1 x i8>], align 1 // CHECK-256: %local_arr_i8 = alloca [3 x <32 x i8>], align 8 // CHECK-256-NEXT: %local_arr_i16 = alloca [3 x <16 x i16>], align 8 @@ -1660,6 +1924,12 @@ void f() { // CHECK-256-NEXT: %local_arr_u16mf4 = alloca [3 x <4 x i16>], align 8 // CHECK-256-NEXT: %local_arr_i8mf8 = alloca [3 x <4 x i8>], align 4 // CHECK-256-NEXT: %local_arr_u8mf8 = alloca [3 x <4 x i8>], align 4 +// CHECK-256-NEXT: %local_arr_bool1 = alloca [3 x <32 x i8>], align 8 +// CHECK-256-NEXT: %local_arr_bool2 = alloca [3 x <16 x i8>], align 8 +// CHECK-256-NEXT: %local_arr_bool4 = alloca [3 x <8 x i8>], align 8 +// CHECK-256-NEXT: %local_arr_bool8 = alloca [3 x <4 x i8>], align 4 +// CHECK-256-NEXT: %local_arr_bool16 = alloca [3 x <2 x i8>], align 2 +// CHECK-256-NEXT: %local_arr_bool32 = alloca [3 x <1 x i8>], align 1 // CHECK-512: %local_arr_i8 = alloca [3 x <64 x i8>], align 8 // CHECK-512-NEXT: %local_arr_i16 = alloca [3 x <32 x i16>], align 8 @@ -1714,6 +1984,13 @@ void f() { // CHECK-512-NEXT: %local_arr_u16mf4 = alloca [3 x <8 x i16>], align 8 // CHECK-512-NEXT: %local_arr_i8mf8 = alloca [3 x <8 x i8>], align 8 // CHECK-512-NEXT: %local_arr_u8mf8 = alloca [3 x <8 x i8>], align 8 +// CHECK-512-NEXT: %local_arr_bool1 = alloca [3 x 
<64 x i8>], align 8 +// CHECK-512-NEXT: %local_arr_bool2 = alloca [3 x <32 x i8>], align 8 +// CHECK-512-NEXT: %local_arr_bool4 = alloca [3 x <16 x i8>], align 8 +// CHECK-512-NEXT: %local_arr_bool8 = alloca [3 x <8 x i8>], align 8 +// CHECK-512-NEXT: %local_arr_bool16 = alloca [3 x <4 x i8>], align 4 +// CHECK-512-NEXT: %local_arr_bool32 = alloca [3 x <2 x i8>], align 2 +// CHECK-512-NEXT: %local_arr_bool64 = alloca [3 x <1 x i8>], align 1 // CHECK-1024: %local_arr_i8 = alloca [3 x <128 x i8>], align 8 // CHECK-1024-NEXT: %local_arr_i16 = alloca [3 x <64 x i16>], align 8 @@ -1768,3 +2045,10 @@ void f() { // CHECK-1024-NEXT: %local_arr_u16mf4 = alloca [3 x <16 x i16>], align 8 // CHECK-1024-NEXT: %local_arr_i8mf8 = alloca [3 x <16 x i8>], align 8 // CHECK-1024-NEXT: %local_arr_u8mf8 = alloca [3 x <16 x i8>], align 8 +// CHECK-1024-NEXT: %local_arr_bool1 = alloca [3 x <128 x i8>], align 8 +// CHECK-1024-NEXT: %local_arr_bool2 = alloca [3 x <64 x i8>], align 8 +// CHECK-1024-NEXT: %local_arr_bool4 = alloca [3 x <32 x i8>], align 8 +// CHECK-1024-NEXT: %local_arr_bool8 = alloca [3 x <16 x i8>], align 8 +// CHECK-1024-NEXT: %local_arr_bool16 = alloca [3 x <8 x i8>], align 8 +// CHECK-1024-NEXT: %local_arr_bool32 = alloca [3 x <4 x i8>], align 4 +// CHECK-1024-NEXT: %local_arr_bool64 = alloca [3 x <2 x i8>], align 2 diff --git a/clang/test/CodeGen/fat-lto-objects.c b/clang/test/CodeGen/fat-lto-objects.c index afce798c5c8194..b50567c024fc8c 100644 --- a/clang/test/CodeGen/fat-lto-objects.c +++ b/clang/test/CodeGen/fat-lto-objects.c @@ -11,10 +11,11 @@ // RUN: llvm-objcopy --dump-section=.llvm.lto=%t.full.split.bc %t.full.split.o // RUN: llvm-dis %t.full.split.bc -o - | FileCheck %s --check-prefixes=FULL,SPLIT,NOUNIFIED +/// Full LTO always sets EnableSplitLTOUnit when the summary is used. 
// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -ffat-lto-objects -emit-obj < %s -o %t.full.nosplit.o // RUN: llvm-readelf -S %t.full.nosplit.o | FileCheck %s --check-prefixes=ELF // RUN: llvm-objcopy --dump-section=.llvm.lto=%t.full.nosplit.bc %t.full.nosplit.o -// RUN: llvm-dis %t.full.nosplit.bc -o - | FileCheck %s --check-prefixes=FULL,NOSPLIT,NOUNIFIED +// RUN: llvm-dis %t.full.nosplit.bc -o - | FileCheck %s --check-prefixes=FULL,SPLIT,NOUNIFIED // RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=thin -fsplit-lto-unit -ffat-lto-objects -emit-obj < %s -o %t.thin.split.o // RUN: llvm-readelf -S %t.thin.split.o | FileCheck %s --check-prefixes=ELF @@ -34,6 +35,21 @@ // RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -ffat-lto-objects -fsplit-lto-unit -S < %s -o - \ // RUN: | FileCheck %s --check-prefixes=ASM +/// Make sure that FatLTO generates .llvm.lto sections that are the same as the output from normal LTO compilations +// RUN: %clang -O2 --target=x86_64-unknown-linux-gnu -fPIE -flto=full -ffat-lto-objects -c %s -o %t.fatlto.full.o +// RUN: llvm-objcopy --dump-section=.llvm.lto=%t.fatlto.full.bc %t.fatlto.full.o +// RUN: llvm-dis < %t.fatlto.full.bc -o %t.fatlto.full.ll +// RUN: %clang -O2 --target=x86_64-unknown-linux-gnu -fPIE -flto=full -c %s -o %t.nofat.full.bc +// RUN: llvm-dis < %t.nofat.full.bc -o %t.nofat.full.ll +// RUN: diff %t.fatlto.full.ll %t.nofat.full.ll + +// RUN: %clang -O2 --target=x86_64-unknown-linux-gnu -fPIE -flto=thin -ffat-lto-objects -c %s -o %t.fatlto.thin.o +// RUN: llvm-objcopy --dump-section=.llvm.lto=%t.fatlto.thin.bc %t.fatlto.thin.o +// RUN: llvm-dis < %t.fatlto.thin.bc -o %t.fatlto.thin.ll +// RUN: %clang -O2 --target=x86_64-unknown-linux-gnu -fPIE -flto=thin -c %s -o %t.nofat.thin.bc +// RUN: llvm-dis < %t.nofat.thin.bc -o %t.nofat.thin.ll +// RUN: diff %t.fatlto.thin.ll %t.nofat.thin.ll + /// Be sure we enable split LTO units correctly under -ffat-lto-objects. 
// SPLIT: ![[#]] = !{i32 1, !"EnableSplitLTOUnit", i32 1} // NOSPLIT: ![[#]] = !{i32 1, !"EnableSplitLTOUnit", i32 0} @@ -51,6 +67,9 @@ // ASM-NEXT: .asciz "BC // ASM-NEXT: .size .Lllvm.embedded.object +const char* foo = "foo"; + int test(void) { + const char* bar = "bar"; return 0xabcd; } diff --git a/clang/test/CodeGenCXX/auto-var-init.cpp b/clang/test/CodeGenCXX/auto-var-init.cpp index e5a9d015f22f27..2ef3409f014e98 100644 --- a/clang/test/CodeGenCXX/auto-var-init.cpp +++ b/clang/test/CodeGenCXX/auto-var-init.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK,CHECK-O0 // RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=pattern %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O0,PATTERN,PATTERN-O0 -// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=pattern %s -O1 -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O1,PATTERN,PATTERN-O1 +// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=pattern %s -O1 -emit-llvm -o - | FileCheck %s -check-prefixes=PATTERN,PATTERN-O1 // RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=zero %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O0,ZERO,ZERO-O0 -// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=zero %s -O1 -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O1,ZERO,ZERO-O1 +// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=zero %s -O1 -emit-llvm -o - | FileCheck %s -check-prefixes=ZERO,ZERO-O1 // RUN: %clang_cc1 -std=c++14 -triple i386-unknown-unknown -fblocks -ftrivial-auto-var-init=pattern %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O0,PATTERN,PATTERN-O0 #pragma clang diagnostic ignored "-Winaccessible-base" @@ -1303,9 +1303,10 @@ 
TEST_CUSTOM(semivolatile, semivolatile, { 0x44444444, 0x44444444 }); // CHECK-O0: call void @llvm.memcpy // CHECK-NOT: !annotation // CHECK-O0: call void @{{.*}}used{{.*}}%custom) -// CHECK-O1: store i32 1145324612, ptr %custom, align 4 -// CHECK-O1-NEXT: %[[I:[^ ]*]] = getelementptr inbounds i8, ptr %custom, i64 4 -// CHECK-O1-NEXT: store i32 1145324612, ptr %[[I]], align 4 +// PATTERN-O1: store i32 1145324612, ptr %custom, align 4 +// PATTERN-O1-NEXT: %[[I:[^ ]*]] = getelementptr inbounds i8, ptr %custom, i64 4 +// PATTERN-O1-NEXT: store i32 1145324612, ptr %[[I]], align 4 +// ZERO-O1: store i64 4919131752989213764, ptr %custom, align 8 // CHECK-NOT: !annotation TEST_UNINIT(semivolatileinit, semivolatileinit); @@ -1418,7 +1419,8 @@ TEST_CUSTOM(matching, matching, { .f = 0xf00f }); // CHECK-O0: call void @llvm.memcpy // CHECK-NOT: !annotation // CHECK-O0: call void @{{.*}}used{{.*}}%custom) -// CHECK-O1: store float 6.145500e+04, ptr {{.*}}, align 4 +// PATTERN-O1: store float 6.145500e+04, ptr {{.*}}, align 4 +// ZERO-O1: store i32 1198526208, ptr %custom, align 4 // CHECK-NOT: !annotation TEST_UNINIT(matchingreverse, matchingreverse); @@ -1445,7 +1447,8 @@ TEST_CUSTOM(matchingreverse, matchingreverse, { .i = 0xf00f }); // CHECK-O0: call void @llvm.memcpy // CHECK-NOT: !annotation // CHECK-O0: call void @{{.*}}used{{.*}}%custom) -// CHECK-O1: store i32 61455, ptr %custom, align 4 +// PATTERN-O1: store i32 61455, ptr %custom, align 4 +// ZERO-O1: store i32 61455, ptr %custom, align 4 // CHECK-NOT: !annotation TEST_UNINIT(unmatched, unmatched); @@ -1471,7 +1474,8 @@ TEST_CUSTOM(unmatched, unmatched, { .i = 0x3badbeef }); // CHECK-O0: call void @llvm.memcpy // CHECK-NOT: !annotation // CHECK-O0: call void @{{.*}}used{{.*}}%custom) -// CHECK-O1: store i32 1001242351, ptr {{.*}}, align 4 +// PATTERN-O1: store i32 1001242351, ptr {{.*}}, align 4 +// ZERO-O1: store i32 1001242351, ptr {{.*}}, align 4 // CHECK-NOT: !annotation TEST_UNINIT(unmatchedreverse, 
unmatchedreverse); @@ -1504,9 +1508,7 @@ TEST_CUSTOM(unmatchedreverse, unmatchedreverse, { .c = 42 }); // PATTERN-O1-NEXT: store i8 -86, ptr %[[I]], align {{.*}} // PATTERN-O1-NEXT: %[[I:[^ ]*]] = getelementptr inbounds i8, ptr %custom, i64 3 // PATTERN-O1-NEXT: store i8 -86, ptr %[[I]], align {{.*}} -// ZERO-O1: store i8 42, ptr {{.*}}, align 4 -// ZERO-O1-NEXT: %[[I:[^ ]*]] = getelementptr inbounds i8, ptr %custom, i64 1 -// ZERO-O1-NEXT: call void @llvm.memset.{{.*}}({{.*}}, i8 0, i64 3, {{.*}}) +// ZERO-O1: store i32 42, ptr {{.*}}, align 4 TEST_UNINIT(unmatchedfp, unmatchedfp); // CHECK-LABEL: @test_unmatchedfp_uninit() @@ -1531,7 +1533,8 @@ TEST_CUSTOM(unmatchedfp, unmatchedfp, { .d = 3.1415926535897932384626433 }); // CHECK-O0: call void @llvm.memcpy // CHECK-NOT: !annotation // CHECK-O0: call void @{{.*}}used{{.*}}%custom) -// CHECK-O1: store double 0x400921FB54442D18, ptr %custom, align 8 +// PATTERN-O1: store double 0x400921FB54442D18, ptr %custom, align 8 +// ZERO-O1: store i64 4614256656552045848, ptr %custom, align 8 // CHECK-NOT: !annotation TEST_UNINIT(emptyenum, emptyenum); diff --git a/clang/test/CodeGenCXX/cxx2b-static-call-operator.cpp b/clang/test/CodeGenCXX/cxx2b-static-call-operator.cpp index fd53649c9b0618..9cf5a7e00e7b4e 100644 --- a/clang/test/CodeGenCXX/cxx2b-static-call-operator.cpp +++ b/clang/test/CodeGenCXX/cxx2b-static-call-operator.cpp @@ -19,16 +19,22 @@ void CallsTheLambda() { // CHECK: define {{.*}}CallsTheLambda{{.*}} // CHECK-NEXT: entry: -// CHECK-NEXT: %call = call noundef i32 {{.*}}(i32 noundef 1, i32 noundef 2) +// CHECK: {{.*}}call {{.*}}GetALambda{{.*}}() +// CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}(i32 noundef 1, i32 noundef 2) // CHECK-NEXT: ret void // CHECK-NEXT: } +Functor GetAFunctor() { + return {}; +} + void call_static_call_operator() { Functor f; f(101, 102); f.operator()(201, 202); Functor{}(301, 302); Functor::operator()(401, 402); + GetAFunctor()(501, 502); } // CHECK: define 
{{.*}}call_static_call_operator{{.*}} @@ -37,6 +43,8 @@ void call_static_call_operator() { // CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 201, i32 noundef 202) // CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 301, i32 noundef 302) // CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 401, i32 noundef 402) +// CHECK: {{.*}}call {{.*}}GetAFunctor{{.*}}() +// CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 501, i32 noundef 502) // CHECK-NEXT: ret void // CHECK-NEXT: } @@ -106,12 +114,16 @@ void test_dep_functors() { // CHECK: define {{.*}}test_dep_functors{{.*}} // CHECK-NEXT: entry: -// CHECK: %call = call noundef i32 {{.*}}DepFunctor{{.*}}(float noundef 1.000000e+00) -// CHECK: %call1 = call noundef i32 {{.*}}DepFunctor{{.*}}(i1 noundef zeroext true) -// CHECK: %call2 = call noundef i32 {{.*}}dep_lambda1{{.*}}(float noundef 1.000000e+00) -// CHECK: %call3 = call noundef i32 {{.*}}dep_lambda1{{.*}}(i1 noundef zeroext true) -// CHECK: %call4 = call noundef i32 {{.*}}dep_lambda2{{.*}}(float noundef 1.000000e+00) -// CHECK: %call5 = call noundef i32 {{.*}}dep_lambda2{{.*}}(i1 noundef zeroext true) +// CHECK: {{.*}} = call noundef i32 {{.*}}DepFunctor{{.*}}(float noundef 1.000000e+00) +// CHECK: {{.*}} = call noundef i32 {{.*}}DepFunctor{{.*}}(i1 noundef zeroext true) +// CHECK: {{.*}}call {{.*}}dep_lambda1{{.*}}() +// CHECK: {{.*}} = call noundef i32 {{.*}}dep_lambda1{{.*}}(float noundef 1.000000e+00) +// CHECK: {{.*}}call {{.*}}dep_lambda1{{.*}}() +// CHECK: {{.*}} = call noundef i32 {{.*}}dep_lambda1{{.*}}(i1 noundef zeroext true) +// CHECK: {{.*}}call {{.*}}dep_lambda2{{.*}}() +// CHECK: {{.*}} = call noundef i32 {{.*}}dep_lambda2{{.*}}(float noundef 1.000000e+00) +// CHECK: {{.*}}call {{.*}}dep_lambda2{{.*}}() +// CHECK: {{.*}} = call noundef i32 {{.*}}dep_lambda2{{.*}}(i1 noundef zeroext true) // CHECK: ret void // CHECK-NEXT: } diff --git 
a/clang/test/CodeGenCXX/cxx2b-static-subscript-operator.cpp b/clang/test/CodeGenCXX/cxx2b-static-subscript-operator.cpp index 5dbd2c50cc56bd..5d8258978c50d5 100644 --- a/clang/test/CodeGenCXX/cxx2b-static-subscript-operator.cpp +++ b/clang/test/CodeGenCXX/cxx2b-static-subscript-operator.cpp @@ -7,12 +7,17 @@ struct Functor { } }; +Functor GetAFunctor() { + return {}; +} + void call_static_subscript_operator() { Functor f; f[101, 102]; f.operator[](201, 202); Functor{}[301, 302]; Functor::operator[](401, 402); + GetAFunctor()[501, 502]; } // CHECK: define {{.*}}call_static_subscript_operator{{.*}} @@ -21,6 +26,8 @@ void call_static_subscript_operator() { // CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 201, i32 noundef 202) // CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 301, i32 noundef 302) // CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 401, i32 noundef 402) +// CHECK: {{.*}}call {{.*}}GetAFunctor{{.*}}() +// CHECK-NEXT: {{.*}} = call noundef i32 {{.*}}Functor{{.*}}(i32 noundef 501, i32 noundef 502) // CHECK-NEXT: ret void // CHECK-NEXT: } @@ -60,7 +67,7 @@ void test_dep_functors() { // CHECK: define {{.*}}test_dep_functors{{.*}} // CHECK-NEXT: entry: -// CHECK: %call = call noundef i32 {{.*}}DepFunctor{{.*}}(float noundef 1.000000e+00) -// CHECK: %call1 = call noundef i32 {{.*}}DepFunctor{{.*}}(i1 noundef zeroext true) +// CHECK: {{.*}} = call noundef i32 {{.*}}DepFunctor{{.*}}(float noundef 1.000000e+00) +// CHECK: {{.*}} = call noundef i32 {{.*}}DepFunctor{{.*}}(i1 noundef zeroext true) // CHECK: ret void // CHECK-NEXT: } diff --git a/clang/test/CodeGenCXX/riscv-mangle-rvv-fixed-vectors.cpp b/clang/test/CodeGenCXX/riscv-mangle-rvv-fixed-vectors.cpp index 32bd49f4ff725d..c9e7313a021a5e 100644 --- a/clang/test/CodeGenCXX/riscv-mangle-rvv-fixed-vectors.cpp +++ b/clang/test/CodeGenCXX/riscv-mangle-rvv-fixed-vectors.cpp @@ -85,6 +85,14 @@ typedef __rvv_float16m8_t vfloat16m8_t; typedef 
__rvv_float32m8_t vfloat32m8_t; typedef __rvv_float64m8_t vfloat64m8_t; +typedef __rvv_bool1_t vbool1_t; +typedef __rvv_bool2_t vbool2_t; +typedef __rvv_bool4_t vbool4_t; +typedef __rvv_bool8_t vbool8_t; +typedef __rvv_bool16_t vbool16_t; +typedef __rvv_bool32_t vbool32_t; +typedef __rvv_bool64_t vbool64_t; + typedef vint8mf8_t fixed_int8mf8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/8))); typedef vuint8mf8_t fixed_uint8mf8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/8))); @@ -164,6 +172,20 @@ typedef vfloat16m8_t fixed_float16m8_t __attribute__((riscv_rvv_vector_bits(__ri typedef vfloat32m8_t fixed_float32m8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen*8))); typedef vfloat64m8_t fixed_float64m8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen*8))); +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool2_t fixed_bool2_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/2))); +typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/4))); +typedef vbool8_t fixed_bool8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/8))); +#if __riscv_v_fixed_vlen >= 128 +typedef vbool16_t fixed_bool16_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/16))); +#endif +#if __riscv_v_fixed_vlen >= 256 +typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/32))); +#endif +#if __riscv_v_fixed_vlen >= 512 +typedef vbool64_t fixed_bool64_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen/64))); +#endif + template struct S {}; // CHECK-64: _Z2f11SI9__RVV_VLSIu14__rvv_int8m1_tLj64EEE @@ -578,3 +600,53 @@ void mf8f1(S) {} // CHECK-512: _Z5mf8f51SI9__RVV_VLSIu16__rvv_uint8mf8_tLj64EEE // CHECK-1024: _Z5mf8f51SI9__RVV_VLSIu16__rvv_uint8mf8_tLj128EEE void mf8f5(S) {} + +// CHECK-64: _Z5bool11SI9__RVV_VLSIu13__rvv_bool1_tLj64EEE +// CHECK-128: 
_Z5bool11SI9__RVV_VLSIu13__rvv_bool1_tLj128EEE +// CHECK-256: _Z5bool11SI9__RVV_VLSIu13__rvv_bool1_tLj256EEE +// CHECK-512: _Z5bool11SI9__RVV_VLSIu13__rvv_bool1_tLj512EEE +// CHECK-1024: _Z5bool11SI9__RVV_VLSIu13__rvv_bool1_tLj1024EEE +void bool1(S) {} + +// CHECK-64: _Z5bool21SI9__RVV_VLSIu13__rvv_bool2_tLj32EEE +// CHECK-128: _Z5bool21SI9__RVV_VLSIu13__rvv_bool2_tLj64EEE +// CHECK-256: _Z5bool21SI9__RVV_VLSIu13__rvv_bool2_tLj128EEE +// CHECK-512: _Z5bool21SI9__RVV_VLSIu13__rvv_bool2_tLj256EEE +// CHECK-1024: _Z5bool21SI9__RVV_VLSIu13__rvv_bool2_tLj512EEE +void bool2(S) {} + +// CHECK-64: _Z5bool41SI9__RVV_VLSIu13__rvv_bool4_tLj16EEE +// CHECK-128: _Z5bool41SI9__RVV_VLSIu13__rvv_bool4_tLj32EEE +// CHECK-256: _Z5bool41SI9__RVV_VLSIu13__rvv_bool4_tLj64EEE +// CHECK-512: _Z5bool41SI9__RVV_VLSIu13__rvv_bool4_tLj128EEE +// CHECK-1024: _Z5bool41SI9__RVV_VLSIu13__rvv_bool4_tLj256EEE +void bool4(S) {} + +// CHECK-64: _Z5bool81SI9__RVV_VLSIu13__rvv_bool8_tLj8EEE +// CHECK-128: _Z5bool81SI9__RVV_VLSIu13__rvv_bool8_tLj16EEE +// CHECK-256: _Z5bool81SI9__RVV_VLSIu13__rvv_bool8_tLj32EEE +// CHECK-512: _Z5bool81SI9__RVV_VLSIu13__rvv_bool8_tLj64EEE +// CHECK-1024: _Z5bool81SI9__RVV_VLSIu13__rvv_bool8_tLj128EEE +void bool8(S) {} + +#if __riscv_v_fixed_vlen >= 128 +// CHECK-128: _Z6bool161SI9__RVV_VLSIu14__rvv_bool16_tLj8EEE +// CHECK-256: _Z6bool161SI9__RVV_VLSIu14__rvv_bool16_tLj16EEE +// CHECK-512: _Z6bool161SI9__RVV_VLSIu14__rvv_bool16_tLj32EEE +// CHECK-1024: _Z6bool161SI9__RVV_VLSIu14__rvv_bool16_tLj64EEE +// +void bool16(S) {} +#endif + +#if __riscv_v_fixed_vlen >= 256 +// CHECK-256: _Z6bool321SI9__RVV_VLSIu14__rvv_bool32_tLj8EEE +// CHECK-512: _Z6bool321SI9__RVV_VLSIu14__rvv_bool32_tLj16EEE +// CHECK-1024: _Z6bool321SI9__RVV_VLSIu14__rvv_bool32_tLj32EEE +void bool32(S) {} +#endif + +#if __riscv_v_fixed_vlen >= 512 +// CHECK-512: _Z6bool641SI9__RVV_VLSIu14__rvv_bool64_tLj8EEE +// CHECK-1024: _Z6bool641SI9__RVV_VLSIu14__rvv_bool64_tLj16EEE +void bool64(S) {} +#endif diff 
--git a/clang/test/CodeGenObjCXX/msabi-stret-arm64.mm b/clang/test/CodeGenObjCXX/msabi-stret-arm64.mm new file mode 100644 index 00000000000000..3bbdbebc5cb576 --- /dev/null +++ b/clang/test/CodeGenObjCXX/msabi-stret-arm64.mm @@ -0,0 +1,77 @@ +// RUN: %clang_cc1 -triple aarch64-pc-windows-msvc -fobjc-runtime=gnustep-2.2 -fobjc-dispatch-method=non-legacy -emit-llvm -o - %s | FileCheck %s + +// Pass and return for type size <= 8 bytes. +struct S1 { + int a[2]; +}; + +// Pass and return hfa <= 8 bytes +struct F1 { + float a[2]; +}; + +// Pass and return for type size > 16 bytes. +struct S2 { + int a[5]; +}; + +// Pass and return aggregate (of size < 16 bytes) with non-trivial destructor. +// Sret and inreg: Returned in x0 +struct S3 { + int a[3]; + ~S3(); +}; +S3::~S3() { +} + + +@interface MsgTest { id isa; } @end +@implementation MsgTest +- (S1) smallS1 { + S1 x; + x.a[0] = 0; + x.a[1] = 1; + return x; + +} +- (F1) smallF1 { + F1 x; + x.a[0] = 0.2f; + x.a[1] = 0.5f; + return x; +} +- (S2) stretS2 { + S2 x; + for (int i = 0; i < 5; i++) { + x.a[i] = i; + } + return x; +} +- (S3) stretInRegS3 { + S3 x; + for (int i = 0; i < 3; i++) { + x.a[i] = i; + } + return x; +} ++ (S3) msgTestStretInRegS3 { + S3 x; + for (int i = 0; i < 3; i++) { + x.a[i] = i; + } + return x; +} +@end + +void test0(MsgTest *t) { + // CHECK: call {{.*}} @objc_msgSend + S1 ret = [t smallS1]; + // CHECK: call {{.*}} @objc_msgSend + F1 ret2 = [t smallF1]; + // CHECK: call {{.*}} @objc_msgSend_stret + S2 ret3 = [t stretS2]; + // CHECK: call {{.*}} @objc_msgSend_stret2 + S3 ret4 = [t stretInRegS3]; + // CHECK: call {{.*}} @objc_msgSend_stret2 + S3 ret5 = [MsgTest msgTestStretInRegS3]; +} diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 1ba2b129f6895a..9c8ca0bb96f612 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -100,8 +100,8 @@ // GFX1103: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1201: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/amdgpu-printf.cl b/clang/test/CodeGenOpenCL/amdgpu-printf.cl index 6c84485b66b4a0..edf6dbf8657cbe 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-printf.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-printf.cl @@ -30,14 +30,7 @@ __kernel void test_printf_int(int i) { // CHECK-NEXT: [[S:%.*]] = alloca [4 x i8], align 1, addrspace(5) // CHECK-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR5:[0-9]+]] -// CHECK-NEXT: [[LOC0:%.*]] = getelementptr i8, ptr addrspace(5) [[S]], i64 0 -// CHECK-NEXT: store i8 102, ptr addrspace(5) [[LOC0]], align 1 -// CHECK-NEXT: [[LOC1:%.*]] = getelementptr i8, ptr addrspace(5) [[S]], i64 1 -// CHECK-NEXT: store i8 111, ptr addrspace(5) [[LOC1]], align 1 -// CHECK-NEXT: [[LOC2:%.*]] = getelementptr i8, ptr addrspace(5) [[S]], i64 2 -// CHECK-NEXT: store i8 111, ptr addrspace(5) [[LOC2]], align 1 -// CHECK-NEXT: [[LOC3:%.*]] = getelementptr i8, ptr addrspace(5) [[S]], i64 3 -// CHECK-NEXT: store i8 0, ptr addrspace(5) [[LOC3]], align 1 +// CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 [[S]], ptr addrspace(4) align 1 @__const.test_printf_str_int.s, i64 4, i1 false) // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(5) [[S]], i64 0, i64 
0 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.2, ptr addrspace(5) noundef [[ARRAYDECAY]], i32 noundef [[TMP2]]) #[[ATTR4]] diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl index 56d757012a5e78..4e3a56b4201bb6 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl @@ -1,59 +1,60 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s typedef float v2f __attribute__((ext_vector_type(2))); -// CHECK-GFX940-LABEL: @test_cvt_f32_bf8 -// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) +// CHECK-LABEL: @test_cvt_f32_bf8 +// CHECK: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) void test_cvt_f32_bf8(global int* out, int a) { *out = __builtin_amdgcn_cvt_f32_bf8(a, 0); } -// CHECK-GFX940-LABEL: @test_cvt_f32_fp8 -// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) +// CHECK-LABEL: @test_cvt_f32_fp8 +// CHECK: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) void test_cvt_f32_fp8(global int* out, int a) { *out = __builtin_amdgcn_cvt_f32_fp8(a, 1); } -// CHECK-GFX940-LABEL: @test_cvt_pk_f32_bf8 -// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) +// CHECK-LABEL: @test_cvt_pk_f32_bf8 +// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) void test_cvt_pk_f32_bf8(global v2f* out, int a) { *out = __builtin_amdgcn_cvt_pk_f32_bf8(a, false); } -// CHECK-GFX940-LABEL: @test_cvt_pk_f32_fp8 -// 
CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) +// CHECK-LABEL: @test_cvt_pk_f32_fp8 +// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) void test_cvt_pk_f32_fp8(global v2f* out, int a) { *out = __builtin_amdgcn_cvt_pk_f32_fp8(a, true); } -// CHECK-GFX940-LABEL: @test_cvt_pk_bf8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false) +// CHECK-LABEL: @test_cvt_pk_bf8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false) void test_cvt_pk_bf8_f32(global int* out, int old, float a, float b) { *out = __builtin_amdgcn_cvt_pk_bf8_f32(a, b, old, false); } -// CHECK-GFX940-LABEL: @test_cvt_pk_fp8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true) +// CHECK-LABEL: @test_cvt_pk_fp8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true) void test_cvt_pk_fp8_f32(global int* out, int old, float a, float b) { *out = __builtin_amdgcn_cvt_pk_fp8_f32(a, b, old, true); } -// CHECK-GFX940-LABEL: @test_cvt_sr_bf8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2) +// CHECK-LABEL: @test_cvt_sr_bf8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2) void test_cvt_sr_bf8_f32(global int* out, int old, float a, int b) { *out = __builtin_amdgcn_cvt_sr_bf8_f32(a, b, old, 2); } -// CHECK-GFX940-LABEL: @test_cvt_sr_fp8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3) +// CHECK-LABEL: @test_cvt_sr_fp8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3) void test_cvt_sr_fp8_f32(global int* out, int old, float a, int b) { *out = __builtin_amdgcn_cvt_sr_fp8_f32(a, b, old, 3); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl new file mode 100644 index 
00000000000000..11747af7ea74f5 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl @@ -0,0 +1,156 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); + +// Wave32 + +// +// amdgcn_wmma_f32_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f32_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f16_16x16x16_f16 +// + +// 
CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_bf16_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_i32_16x16x16_iu8 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false); +} + +// +// amdgcn_wmma_i32_16x16x16_iu4 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32( +// CHECK-GFX1200-NEXT: 
entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x 
float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl new file mode 100644 index 00000000000000..ef32648743ca62 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl @@ -0,0 +1,155 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm 
-o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// Wave64 + +// +// amdgcn_wmma_f32_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f32_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f16_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// 
+void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_bf16_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_i32_16x16x16_iu8 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false); +} + +// +// amdgcn_wmma_i32_16x16x16_iu4 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) +{ + *out = 
__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl new file mode 100644 index 00000000000000..b303c2f25dddac --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl @@ -0,0 +1,135 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef short v16s 
__attribute__((ext_vector_type(16))); + +// Wave32 + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: 
@test_amdgcn_swmmac_bf16_16x16x32_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w32( +// 
CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: 
[[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl new file mode 100644 index 00000000000000..855fa7351e1556 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef 
half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); + +// Wave64 + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x half> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index); +} + 
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w64( +// 
CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 
x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl index 41a78ae268be57..49cb797df42331 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl @@ -16,7 +16,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // Wave32 -void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f, +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f, global v16h* out16h, v16h a16h, v16h b16h, v16h c16h, global v16s* out16s, v2i a2i, v2i b2i, v16s c16s, global v8i* out8i, v4i a4i, v4i b4i, v8i c8i) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl index 
4f13c75e5e81f9..3c6aaf5e38281d 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl @@ -2,7 +2,6 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100 -typedef float v4f __attribute__((ext_vector_type(4))); typedef float v8f __attribute__((ext_vector_type(8))); typedef half v16h __attribute__((ext_vector_type(16))); typedef int v2i __attribute__((ext_vector_type(2))); @@ -20,7 +19,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] // CHECK-GFX1100-NEXT: ret void // @@ -35,7 +34,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v16h a, v16h b, v8f // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -50,7 +49,7 @@ void 
test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v16s a, v16s b, v8f // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -65,7 +64,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v16h* out, v16h a, v16h b, v16 // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -80,7 +79,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x half> 
[[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -95,7 +94,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(global v16h* out, v16h a, v16h b // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -110,7 +109,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(global v16s* out, v16s a, v16s // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -125,7 +124,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v4i a, v4i b, v8i c) // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl index 4797675f50d42e..1490f14fd17b69 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl @@ -3,12 +3,10 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100 typedef float v4f __attribute__((ext_vector_type(4))); -typedef float v8f __attribute__((ext_vector_type(8))); typedef half v8h __attribute__((ext_vector_type(8))); typedef half v16h __attribute__((ext_vector_type(16))); typedef int v2i __attribute__((ext_vector_type(2))); typedef int v4i __attribute__((ext_vector_type(4))); -typedef int v8i __attribute__((ext_vector_type(8))); typedef short v8s __attribute__((ext_vector_type(8))); typedef short v16s __attribute__((ext_vector_type(16))); @@ -22,7 +20,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] // CHECK-GFX1100-NEXT: ret void // @@ -37,7 +35,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v16h a, v16h b, v4f // 
CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -52,7 +50,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v16s a, v16s b, v4f // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -67,7 +65,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v8h* out, v16h a, v16h b, v8h // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -82,7 +80,7 @@ void 
test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8 // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -97,7 +95,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b, // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -112,7 +110,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) // 
CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -127,7 +125,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, v4i a, v4i b, v4i c) // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // diff --git a/clang/test/CoverageMapping/if.cpp b/clang/test/CoverageMapping/if.cpp index 3045ffe43948cb..445cdfc20e2aff 100644 --- a/clang/test/CoverageMapping/if.cpp +++ b/clang/test/CoverageMapping/if.cpp @@ -234,6 +234,35 @@ constexpr int check_macro_consteval_if_skipped(int i) { // CHECK-NEXT: [[@LINE return i; } +struct false_value { + constexpr operator bool() { + return false; + } +}; + +template struct dependable_false_value { + constexpr operator bool() { + return false; + } +}; + +// GH-80285 +void should_not_crash() { + if constexpr (false_value{}) { }; +} + +template void should_not_crash_dependable() { + if constexpr (dependable_false_value{}) { }; +} + +void should_not_crash_with_template_instance() { + should_not_crash_dependable(); +} + +void should_not_crash_with_requires_expr() { + if constexpr (requires {42;}) { }; +} + int instantiate_consteval(int i) { i *= check_consteval_with_else_discarded_then(i); i *= check_notconsteval_with_else_discarded_else(i); diff --git a/clang/test/CoverageMapping/statement-expression.c b/clang/test/CoverageMapping/statement-expression.c new file mode 100644 index 
00000000000000..5f9ab5838af342 --- /dev/null +++ b/clang/test/CoverageMapping/statement-expression.c @@ -0,0 +1,36 @@ +// RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name statement-expression.c %s + +// No crash for the following examples, where GNU Statement Expression extension +// could introduce region terminators (break, goto etc) before implicit +// initializers in a struct or an array. +// See https://github.com/llvm/llvm-project/pull/89564 + +struct Foo { + int field1; + int field2; +}; + +void f1(void) { + struct Foo foo = { + .field1 = ({ + switch (0) { + case 0: + break; // A region terminator + } + 0; + }), + // ImplicitValueInitExpr introduced here for .field2 + }; +} + +void f2(void) { + int arr[3] = { + [0] = ({ + goto L0; // A region terminator +L0: + 0; + }), + // ImplicitValueInitExpr introduced here for subscript [1] + [2] = 0, + }; +} diff --git a/clang/test/CoverageMapping/templates.cpp b/clang/test/CoverageMapping/templates.cpp index 7010edbc32c34a..143e566a33cb85 100644 --- a/clang/test/CoverageMapping/templates.cpp +++ b/clang/test/CoverageMapping/templates.cpp @@ -19,3 +19,16 @@ int main() { func(true); return 0; } + +namespace structural_value_crash { + template + void tpl_fn() { + (void)p; + } + + int arr[] = {1, 2, 3}; + + void test() { + tpl_fn(); + } +} diff --git a/clang/test/Driver/aarch64-cssc.c b/clang/test/Driver/aarch64-cssc.c index a3e18663279bbd..5df0ea79d7c850 100644 --- a/clang/test/Driver/aarch64-cssc.c +++ b/clang/test/Driver/aarch64-cssc.c @@ -9,6 +9,7 @@ // RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a %s 2>&1 | FileCheck %s // RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a+cssc %s 2>&1 | FileCheck %s // RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a+nocssc %s 2>&1 | FileCheck %s --check-prefix=NO_CSSC +// RUN: %clang -S -o - 
-emit-llvm --target=aarch64-none-elf -mcpu=ampere1b %s 2>&1 | FileCheck %s // CHECK: "target-features"="{{.*}},+cssc // NO_CSSC: "target-features"="{{.*}},-cssc diff --git a/clang/test/Driver/aarch64-mcpu.c b/clang/test/Driver/aarch64-mcpu.c index 511482a420da26..3e07f3597f3408 100644 --- a/clang/test/Driver/aarch64-mcpu.c +++ b/clang/test/Driver/aarch64-mcpu.c @@ -72,6 +72,9 @@ // RUN: %clang --target=aarch64 -mcpu=cortex-r82 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEXR82 %s // CORTEXR82: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-r82" +// RUN: %clang --target=aarch64 -mcpu=cobalt-100 -### -c %s 2>&1 | FileCheck -check-prefix=COBALT-100 %s +// COBALT-100: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-n2" + // RUN: %clang --target=aarch64 -mcpu=grace -### -c %s 2>&1 | FileCheck -check-prefix=GRACE %s // GRACE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-v2" diff --git a/clang/test/Driver/fat-lto-objects.c b/clang/test/Driver/fat-lto-objects.c index 97002db6edc51e..d9a5ba88ea6d6f 100644 --- a/clang/test/Driver/fat-lto-objects.c +++ b/clang/test/Driver/fat-lto-objects.c @@ -23,11 +23,17 @@ // CHECK-CC-S-EL-LTO-SAME: -emit-llvm // CHECK-CC-S-EL-LTO-SAME: -ffat-lto-objects -/// When fat LTO is enabled wihtout -S we expect native object output and -ffat-lto-object to be passed to cc1. +/// When fat LTO is enabled without -S we expect native object output and -ffat-lto-object to be passed to cc1. // RUN: %clang --target=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -### %s -c 2>&1 | FileCheck %s -check-prefix=CHECK-CC-C-LTO // CHECK-CC-C-LTO: -cc1 -// CHECK-CC-C-LTO: -emit-obj -// CHECK-CC-C-LTO: -ffat-lto-objects +// CHECK-CC-C-LTO-SAME: -emit-obj +// CHECK-CC-C-LTO-SAME: -ffat-lto-objects + +/// When fat LTO is enabled with -c and -emit-llvm we expect bitcode output and -ffat-lto-object to be passed to cc1. 
+// RUN: %clang --target=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -### %s -c -emit-llvm 2>&1 | FileCheck %s -check-prefix=CHECK-CC-C-EL-LTO +// CHECK-CC-C-EL-LTO: -cc1 +// CHECK-CC-C-EL-LTO-SAME: -emit-llvm-bc +// CHECK-CC-C-EL-LTO-SAME: -ffat-lto-objects /// Make sure we don't have a warning for -ffat-lto-objects being unused // RUN: %clang --target=x86_64-unknown-linux-gnu -ffat-lto-objects -fdriver-only -Werror -v %s -c 2>&1 | FileCheck %s -check-prefix=CHECK-CC-NOLTO diff --git a/clang/test/Driver/fveclib.c b/clang/test/Driver/fveclib.c index e2a7619e9b89f7..8a230284bcdfe4 100644 --- a/clang/test/Driver/fveclib.c +++ b/clang/test/Driver/fveclib.c @@ -31,3 +31,21 @@ // RUN: %clang -fveclib=Accelerate %s -nodefaultlibs -target arm64-apple-ios8.0.0 -### 2>&1 | FileCheck --check-prefix=CHECK-LINK-NODEFAULTLIBS %s // CHECK-LINK-NODEFAULTLIBS-NOT: "-framework" "Accelerate" + + +/* Verify that the correct vector library is passed to LTO flags. */ + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=LIBMVEC -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-LIBMVEC %s +// CHECK-LTO-LIBMVEC: "-plugin-opt=-vector-library=LIBMVEC-X86" + +// RUN: %clang -### --target=powerpc64-unknown-linux-gnu -fveclib=MASSV -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-MASSV %s +// CHECK-LTO-MASSV: "-plugin-opt=-vector-library=MASSV" + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=SVML -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-SVML %s +// CHECK-LTO-SVML: "-plugin-opt=-vector-library=SVML" + +// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=SLEEF -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-SLEEF %s +// CHECK-LTO-SLEEF: "-plugin-opt=-vector-library=sleefgnuabi" + +// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=ArmPL -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-ARMPL %s +// CHECK-LTO-ARMPL: "-plugin-opt=-vector-library=ArmPL" diff --git a/clang/test/Driver/modules-skip-odr-check-in-gmf.cpp 
b/clang/test/Driver/modules-skip-odr-check-in-gmf.cpp new file mode 100644 index 00000000000000..b00b6d330ba459 --- /dev/null +++ b/clang/test/Driver/modules-skip-odr-check-in-gmf.cpp @@ -0,0 +1,10 @@ +// RUN: %clang -std=c++20 -### -c %s 2>&1 | FileCheck %s +// RUN: %clang -std=c++20 -fno-skip-odr-check-in-gmf -### -c %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=UNUSED +// RUN: %clang -std=c++20 -Xclang -fno-skip-odr-check-in-gmf -### -c %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=NO-SKIP + +// CHECK: -fskip-odr-check-in-gmf +// UNUSED: warning: argument unused during compilation: '-fno-skip-odr-check-in-gmf' +// UNUSED-NOT: -fno-skip-odr-check-in-gmf +// NO-SKIP: -fskip-odr-check-in-gmf{{.*}}-fno-skip-odr-check-in-gmf diff --git a/clang/test/Driver/sparc-fixed-register.c b/clang/test/Driver/sparc-fixed-register.c new file mode 100644 index 00000000000000..24880b9c9d86fd --- /dev/null +++ b/clang/test/Driver/sparc-fixed-register.c @@ -0,0 +1,181 @@ +// RUN: %clang --target=sparc-none-gnu -ffixed-g1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G1 < %t %s +// CHECK-FIXED-G1: "-target-feature" "+reserve-g1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g2 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G2 < %t %s +// CHECK-FIXED-G2: "-target-feature" "+reserve-g2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G3 < %t %s +// CHECK-FIXED-G3: "-target-feature" "+reserve-g3" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G4 < %t %s +// CHECK-FIXED-G4: "-target-feature" "+reserve-g4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G5 < %t %s +// CHECK-FIXED-G5: "-target-feature" "+reserve-g5" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g6 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G6 < %t %s +// CHECK-FIXED-G6: 
"-target-feature" "+reserve-g6" + +// RUN: %clang --target=sparc-none-gnu -ffixed-g7 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-G7 < %t %s +// CHECK-FIXED-G7: "-target-feature" "+reserve-g7" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o0 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O0 < %t %s +// CHECK-FIXED-O0: "-target-feature" "+reserve-o0" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O1 < %t %s +// CHECK-FIXED-O1: "-target-feature" "+reserve-o1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o2 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O2 < %t %s +// CHECK-FIXED-O2: "-target-feature" "+reserve-o2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O3 < %t %s +// CHECK-FIXED-O3: "-target-feature" "+reserve-o3" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O4 < %t %s +// CHECK-FIXED-O4: "-target-feature" "+reserve-o4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-o5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-O5 < %t %s +// CHECK-FIXED-O5: "-target-feature" "+reserve-o5" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l0 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L0 < %t %s +// CHECK-FIXED-L0: "-target-feature" "+reserve-l0" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L1 < %t %s +// CHECK-FIXED-L1: "-target-feature" "+reserve-l1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l2 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L2 < %t %s +// CHECK-FIXED-L2: "-target-feature" "+reserve-l2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L3 < %t %s +// CHECK-FIXED-L3: "-target-feature" "+reserve-l3" + +// RUN: %clang 
--target=sparc-none-gnu -ffixed-l4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L4 < %t %s +// CHECK-FIXED-L4: "-target-feature" "+reserve-l4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L5 < %t %s +// CHECK-FIXED-L5: "-target-feature" "+reserve-l5" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l6 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L6 < %t %s +// CHECK-FIXED-L6: "-target-feature" "+reserve-l6" + +// RUN: %clang --target=sparc-none-gnu -ffixed-l7 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-L7 < %t %s +// CHECK-FIXED-L7: "-target-feature" "+reserve-l7" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i0 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I0 < %t %s +// CHECK-FIXED-I0: "-target-feature" "+reserve-i0" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i1 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I1 < %t %s +// CHECK-FIXED-I1: "-target-feature" "+reserve-i1" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i2 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I2 < %t %s +// CHECK-FIXED-I2: "-target-feature" "+reserve-i2" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i3 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I3 < %t %s +// CHECK-FIXED-I3: "-target-feature" "+reserve-i3" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i4 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I4 < %t %s +// CHECK-FIXED-I4: "-target-feature" "+reserve-i4" + +// RUN: %clang --target=sparc-none-gnu -ffixed-i5 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-FIXED-I5 < %t %s +// CHECK-FIXED-I5: "-target-feature" "+reserve-i5" + +// Test multiple of reserve-* options together. 
+// RUN: %clang --target=sparc-none-gnu \ +// RUN: -ffixed-g1 \ +// RUN: -ffixed-o2 \ +// RUN: -ffixed-l3 \ +// RUN: -ffixed-i4 \ +// RUN: -### %s 2> %t +// RUN: FileCheck \ +// RUN: --check-prefix=CHECK-FIXED-G1 \ +// RUN: --check-prefix=CHECK-FIXED-O2 \ +// RUN: --check-prefix=CHECK-FIXED-L3 \ +// RUN: --check-prefix=CHECK-FIXED-I4 \ +// RUN: < %t %s + +// Test all reserve-* options together. +// RUN: %clang --target=sparc-none-gnu \ +// RUN: -ffixed-g1 \ +// RUN: -ffixed-g2 \ +// RUN: -ffixed-g3 \ +// RUN: -ffixed-g4 \ +// RUN: -ffixed-g5 \ +// RUN: -ffixed-g6 \ +// RUN: -ffixed-g7 \ +// RUN: -ffixed-o0 \ +// RUN: -ffixed-o1 \ +// RUN: -ffixed-o2 \ +// RUN: -ffixed-o3 \ +// RUN: -ffixed-o4 \ +// RUN: -ffixed-o5 \ +// RUN: -ffixed-l0 \ +// RUN: -ffixed-l1 \ +// RUN: -ffixed-l2 \ +// RUN: -ffixed-l3 \ +// RUN: -ffixed-l4 \ +// RUN: -ffixed-l5 \ +// RUN: -ffixed-l6 \ +// RUN: -ffixed-l7 \ +// RUN: -ffixed-i0 \ +// RUN: -ffixed-i1 \ +// RUN: -ffixed-i2 \ +// RUN: -ffixed-i3 \ +// RUN: -ffixed-i4 \ +// RUN: -ffixed-i5 \ +// RUN: -### %s 2> %t +// RUN: FileCheck \ +// RUN: --check-prefix=CHECK-FIXED-G1 \ +// RUN: --check-prefix=CHECK-FIXED-G2 \ +// RUN: --check-prefix=CHECK-FIXED-G3 \ +// RUN: --check-prefix=CHECK-FIXED-G4 \ +// RUN: --check-prefix=CHECK-FIXED-G5 \ +// RUN: --check-prefix=CHECK-FIXED-G6 \ +// RUN: --check-prefix=CHECK-FIXED-G7 \ +// RUN: --check-prefix=CHECK-FIXED-O0 \ +// RUN: --check-prefix=CHECK-FIXED-O1 \ +// RUN: --check-prefix=CHECK-FIXED-O2 \ +// RUN: --check-prefix=CHECK-FIXED-O3 \ +// RUN: --check-prefix=CHECK-FIXED-O4 \ +// RUN: --check-prefix=CHECK-FIXED-O5 \ +// RUN: --check-prefix=CHECK-FIXED-L0 \ +// RUN: --check-prefix=CHECK-FIXED-L1 \ +// RUN: --check-prefix=CHECK-FIXED-L2 \ +// RUN: --check-prefix=CHECK-FIXED-L3 \ +// RUN: --check-prefix=CHECK-FIXED-L4 \ +// RUN: --check-prefix=CHECK-FIXED-L5 \ +// RUN: --check-prefix=CHECK-FIXED-L6 \ +// RUN: --check-prefix=CHECK-FIXED-L7 \ +// RUN: --check-prefix=CHECK-FIXED-I0 \ +// RUN: 
--check-prefix=CHECK-FIXED-I1 \ +// RUN: --check-prefix=CHECK-FIXED-I2 \ +// RUN: --check-prefix=CHECK-FIXED-I3 \ +// RUN: --check-prefix=CHECK-FIXED-I4 \ +// RUN: --check-prefix=CHECK-FIXED-I5 \ +// RUN: < %t %s diff --git a/clang/test/Driver/sparc64-codemodel.c b/clang/test/Driver/sparc64-codemodel.c new file mode 100644 index 00000000000000..e4b01fd61b6fac --- /dev/null +++ b/clang/test/Driver/sparc64-codemodel.c @@ -0,0 +1,6 @@ +// RUN: %clang --target=sparc64 -mcmodel=medlow %s -### 2>&1 | FileCheck -check-prefix=MEDLOW %s +// RUN: %clang --target=sparc64 -mcmodel=medmid %s -### 2>&1 | FileCheck -check-prefix=MEDMID %s +// RUN: %clang --target=sparc64 -mcmodel=medany %s -### 2>&1 | FileCheck -check-prefix=MEDANY %s +// MEDLOW: "-mcmodel=small" +// MEDMID: "-mcmodel=medium" +// MEDANY: "-mcmodel=large" diff --git a/clang/test/Driver/tls-dialect.c b/clang/test/Driver/tls-dialect.c new file mode 100644 index 00000000000000..4e105ce3cea5d9 --- /dev/null +++ b/clang/test/Driver/tls-dialect.c @@ -0,0 +1,25 @@ +// RUN: %clang -### --target=riscv64-freebsd -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=DESC %s +// RUN: %clang -### --target=riscv64-linux -mtls-dialect=trad %s 2>&1 | FileCheck --check-prefix=NODESC %s +// RUN: %clang -### --target=riscv64-linux %s 2>&1 | FileCheck --check-prefix=NODESC %s +// RUN: %clang -### --target=x86_64-linux -mtls-dialect=gnu %s 2>&1 | FileCheck --check-prefix=NODESC %s + +/// LTO +// RUN: %clang -### --target=riscv64-linux -flto -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=LTO-DESC %s +// RUN: %clang -### --target=riscv64-linux -flto %s 2>&1 | FileCheck --check-prefix=LTO-NODESC %s + +/// Unsupported target +/// GCC supports -mtls-dialect= for AArch64, but we just unsupport it for AArch64 as it is very rarely used. 
+// RUN: not %clang --target=aarch64-linux -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-TARGET %s +// RUN: not %clang --target=x86_64-apple-macos -mtls-dialect=desc -flto %s 2>&1 | FileCheck -check-prefix=UNSUPPORTED-TARGET %s + +/// Unsupported argument +// RUN: not %clang -### --target=riscv64-linux -mtls-dialect=gnu2 %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-ARG %s +// RUN: not %clang -### --target=x86_64-linux -mtls-dialect=gnu2 %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-ARG %s + +// DESC: "-cc1" {{.*}}"-enable-tlsdesc" +// NODESC-NOT: "-enable-tlsdesc" +// LTO-DESC: "-plugin-opt=-enable-tlsdesc" +// LTO-NODESC-NOT: "-plugin-opt=-enable-tlsdesc" + +// UNSUPPORTED-TARGET: error: unsupported option '-mtls-dialect=' for target +// UNSUPPORTED-ARG: error: unsupported argument 'gnu2' to option '-mtls-dialect=' for target diff --git a/clang/test/Driver/wasm-toolchain.c b/clang/test/Driver/wasm-toolchain.c index f950283ec42aa0..88590a3ba4c453 100644 --- a/clang/test/Driver/wasm-toolchain.c +++ b/clang/test/Driver/wasm-toolchain.c @@ -197,3 +197,27 @@ // RUN: not %clang -### %s --target=wasm32-unknown-unknown --sysroot=%s/no-sysroot-there -fPIC -mno-mutable-globals %s 2>&1 \ // RUN: | FileCheck -check-prefix=PIC_NO_MUTABLE_GLOBALS %s // PIC_NO_MUTABLE_GLOBALS: error: invalid argument '-fPIC' not allowed with '-mno-mutable-globals' + +// Test that `wasm32-wasip2` invokes the `wasm-component-ld` linker by default +// instead of `wasm-ld`. + +// RUN: %clang -### -O2 --target=wasm32-wasip2 %s --sysroot /foo 2>&1 \ +// RUN: | FileCheck -check-prefix=LINK_WASIP2 %s +// LINK_WASIP2: "-cc1" {{.*}} "-o" "[[temp:[^"]*]]" +// LINK_WASIP2: wasm-component-ld{{.*}}" "-L/foo/lib/wasm32-wasip2" "crt1.o" "[[temp]]" "-lc" "{{.*[/\\]}}libclang_rt.builtins-wasm32.a" "-o" "a.out" + +// Test that on `wasm32-wasip2` the `wasm-component-ld` programs is told where +// to find `wasm-ld` by default. 
+ +// RUN: %clang -### -O2 --target=wasm32-wasip2 %s --sysroot /foo 2>&1 \ +// RUN: | FileCheck -check-prefix=LINK_WASIP2_FIND_WASMLD %s +// LINK_WASIP2_FIND_WASMLD: "-cc1" {{.*}} "-o" "[[temp:[^"]*]]" +// LINK_WASIP2_FIND_WASMLD: wasm-component-ld{{.*}}" {{.*}} "--wasm-ld-path" "{{.*}}wasm-ld{{.*}}" {{.*}} "[[temp]]" {{.*}} + +// If `wasm32-wasip2` is configured with `wasm-ld` as a linker then don't pass +// the `--wasm-ld-path` flag. + +// RUN: %clang -### -O2 --target=wasm32-wasip2 -fuse-ld=lld %s --sysroot /foo 2>&1 \ +// RUN: | FileCheck -check-prefix=LINK_WASIP2_USE_WASMLD %s +// LINK_WASIP2_USE_WASMLD: "-cc1" {{.*}} "-o" "[[temp:[^"]*]]" +// LINK_WASIP2_USE_WASMLD: wasm-ld{{.*}}" "-m" "wasm32" {{.*}} "[[temp]]" {{.*}} diff --git a/clang/test/Format/clang-format-ignore.cpp b/clang/test/Format/clang-format-ignore.cpp index b4e526463000ae..fb49fa9dd52c65 100644 --- a/clang/test/Format/clang-format-ignore.cpp +++ b/clang/test/Format/clang-format-ignore.cpp @@ -1,4 +1,3 @@ -// UNSUPPORTED: system-windows // RUN: rm -rf %t.dir // RUN: mkdir -p %t.dir/level1/level2 diff --git a/clang/test/Format/dump-config-objc-stdin.m b/clang/test/Format/dump-config-objc-stdin.m index b22ff7b3328caa..d81711a84d79bf 100644 --- a/clang/test/Format/dump-config-objc-stdin.m +++ b/clang/test/Format/dump-config-objc-stdin.m @@ -1,5 +1,8 @@ +// RUN: clang-format -assume-filename=foo.m -dump-config | FileCheck %s + // RUN: clang-format -dump-config - < %s | FileCheck %s // CHECK: Language: ObjC + @interface Foo @end diff --git a/clang/test/Format/verbose.cpp b/clang/test/Format/verbose.cpp index dd625e3f67e55d..4ab03d8f62aefc 100644 --- a/clang/test/Format/verbose.cpp +++ b/clang/test/Format/verbose.cpp @@ -1,12 +1,6 @@ -// RUN: clang-format %s 2> %t.stderr +// RUN: clang-format -verbose 2> %t.stderr // RUN: not grep "Formatting" %t.stderr -// RUN: clang-format %s -verbose 2> %t.stderr -// RUN: grep -E "Formatting (.*)verbose.cpp(.*)" %t.stderr -// RUN: clang-format %s -verbose=false 2> 
%t.stderr -// RUN: not grep "Formatting" %t.stderr - -int a; -// RUN: clang-format %s 2> %t.stderr +// RUN: clang-format %s 2> %t.stderr // RUN: not grep "Formatting" %t.stderr // RUN: clang-format %s -verbose 2> %t.stderr // RUN: grep -E "Formatting (.*)verbose.cpp(.*)" %t.stderr diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c index 84aed5c9c36fe4..39ed02f50950dd 100644 --- a/clang/test/Misc/target-invalid-cpu-note.c +++ b/clang/test/Misc/target-invalid-cpu-note.c @@ -5,11 +5,11 @@ // RUN: not %clang_cc1 -triple arm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AARCH64 // AARCH64: error: unknown target CPU 'not-a-cpu' -// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, grace{{$}} +// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, 
apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}} // RUN: not %clang_cc1 -triple arm64--- -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE_AARCH64 // TUNE_AARCH64: error: unknown target CPU 'not-a-cpu' -// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, grace{{$}} +// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, 
apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}} // RUN: not %clang_cc1 -triple i386--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix X86 // X86: error: unknown target CPU 'not-a-cpu' diff --git a/clang/test/Modules/concept.cppm b/clang/test/Modules/concept.cppm index 0e85a46411a544..0fdb5ea8968085 100644 --- a/clang/test/Modules/concept.cppm +++ b/clang/test/Modules/concept.cppm @@ -5,6 +5,12 @@ // RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm // RUN: %clang_cc1 -std=c++20 -fprebuilt-module-path=%t -I%t -DDIFFERENT %t/B.cppm -verify // RUN: %clang_cc1 -std=c++20 -fprebuilt-module-path=%t -I%t %t/B.cppm -verify +// +// Testing the behavior of `-fskip-odr-check-in-gmf` +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/A.cppm -emit-module-interface -o %t/A.pcm +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf -fprebuilt-module-path=%t -I%t \ +// RUN: -DDIFFERENT -DSKIP_ODR_CHECK_IN_GMF %t/B.cppm -verify + //--- foo.h #ifndef FOO_H @@ -70,7 +76,10 @@ module; export module B; import A; -#ifdef DIFFERENT +#ifdef SKIP_ODR_CHECK_IN_GMF +// expected-error@B.cppm:* {{call to object of type '__fn' is ambiguous}} +// expected-note@* 1+{{candidate function}} +#elif defined(DIFFERENT) // expected-error@foo.h:41 {{'__fn::operator()' from module 'A.' 
is not present in definition of '__fn' provided earlier}} // expected-note@* 1+{{declaration of 'operator()' does not match}} #else diff --git a/clang/test/Modules/cxx20-modules-enum-odr.cppm b/clang/test/Modules/cxx20-modules-enum-odr.cppm new file mode 100644 index 00000000000000..831c01143a27ba --- /dev/null +++ b/clang/test/Modules/cxx20-modules-enum-odr.cppm @@ -0,0 +1,51 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/mod1.cppm -emit-module-interface -o %t/mod1.pcm +// RUN: %clang_cc1 -std=c++20 %t/mod2.cppm -emit-module-interface -o %t/mod2.pcm +// RUN: %clang_cc1 -std=c++20 %t/test.cpp -fprebuilt-module-path=%t -verify -fsyntax-only + +//--- size_t.h + +extern "C" { + typedef unsigned int size_t; +} + +//--- csize_t +namespace std { + using :: size_t; +} + +//--- align.h +namespace std { + enum class align_val_t : size_t {}; +} + +//--- mod1.cppm +module; +#include "size_t.h" +#include "align.h" +export module mod1; +namespace std { +export using std::align_val_t; +} + +//--- mod2.cppm +module; +#include "size_t.h" +#include "csize_t" +#include "align.h" +export module mod2; +namespace std { +export using std::align_val_t; +} + +//--- test.cpp +// expected-no-diagnostics +import mod1; +import mod2; +void test() { + std::align_val_t v; +} + diff --git a/clang/test/Modules/hashing-decls-in-exprs-from-gmf.cppm b/clang/test/Modules/hashing-decls-in-exprs-from-gmf.cppm new file mode 100644 index 00000000000000..8db53c0ace8796 --- /dev/null +++ b/clang/test/Modules/hashing-decls-in-exprs-from-gmf.cppm @@ -0,0 +1,67 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/A.cppm -emit-module-interface -o %t/A.pcm +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/B.cppm -emit-module-interface -o %t/B.pcm +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/test.cpp -fprebuilt-module-path=%t -fsyntax-only -verify + +//--- 
header.h +#pragma once +template +class Optional {}; + +template +concept C = requires(const _Tp& __t) { + [](const Optional<_Up>&) {}(__t); +}; + +//--- func.h +#include "header.h" +template +void func() {} + +//--- duplicated_func.h +#include "header.h" +template +void duplicated_func() {} + +//--- test_func.h +#include "func.h" + +void test_func() { + func>(); +} + +//--- test_duplicated_func.h +#include "duplicated_func.h" + +void test_duplicated_func() { + duplicated_func>(); +} + +//--- A.cppm +module; +#include "header.h" +#include "test_duplicated_func.h" +export module A; +export using ::test_duplicated_func; + +//--- B.cppm +module; +#include "header.h" +#include "test_func.h" +#include "test_duplicated_func.h" +export module B; +export using ::test_func; +export using ::test_duplicated_func; + +//--- test.cpp +// expected-no-diagnostics +import A; +import B; + +void test() { + test_func(); + test_duplicated_func(); +} diff --git a/clang/test/Modules/no-eager-load.cppm b/clang/test/Modules/no-eager-load.cppm index 6632cc60c8eb84..8a2c7656bca2b4 100644 --- a/clang/test/Modules/no-eager-load.cppm +++ b/clang/test/Modules/no-eager-load.cppm @@ -9,19 +9,10 @@ // RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %t/d.cpp \ // RUN: -fprebuilt-module-path=%t // -// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/e.cppm -o %t/e.pcm -// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/f.cppm -o %t/f.pcm -// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %t/g.cpp \ -// RUN: -fprebuilt-module-path=%t -// // RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/h.cppm \ // RUN: -fprebuilt-module-path=%t -o %t/h.pcm -// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/i.cppm \ -// RUN: -fprebuilt-module-path=%t -o %t/i.pcm // RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %t/j.cpp \ // RUN: -fprebuilt-module-path=%t -// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %t/k.cpp \ -// RUN: -fprebuilt-module-path=%t //--- a.cppm export module a; @@ -53,58 
+44,14 @@ void use() { // expected-note@* {{but in 'a' found a different body}} } -//--- foo.h -void foo() { - -} - -//--- bar.h -void bar(); -void foo() { - bar(); -} - -//--- e.cppm -module; -#include "foo.h" -export module e; -export using ::foo; - -//--- f.cppm -module; -#include "bar.h" -export module f; -export using ::foo; - -//--- g.cpp -import e; -import f; -void use() { - foo(); // expected-error@* {{'foo' has different definitions in different modules;}} - // expected-note@* {{but in 'e.' found a different body}} -} - //--- h.cppm export module h; export import a; export import b; -//--- i.cppm -export module i; -export import e; -export import f; - //--- j.cpp import h; void use() { foo(); // expected-error@* {{'foo' has different definitions in different modules;}} // expected-note@* {{but in 'a' found a different body}} } - -//--- k.cpp -import i; -void use() { - foo(); // expected-error@* {{'foo' has different definitions in different modules;}} - // expected-note@* {{but in 'e.' found a different body}} -} - diff --git a/clang/test/Modules/no-undeclared-includes-builtins.cpp b/clang/test/Modules/no-undeclared-includes-builtins.cpp index c9bffc55619905..f9eefd24a33c7d 100644 --- a/clang/test/Modules/no-undeclared-includes-builtins.cpp +++ b/clang/test/Modules/no-undeclared-includes-builtins.cpp @@ -8,7 +8,7 @@ // headers. 
// RUN: rm -rf %t -// RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules -fimplicit-module-maps -I %S/Inputs/no-undeclared-includes-builtins/libcxx -I %S/Inputs/no-undeclared-includes-builtins/glibc %s +// RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules -fbuiltin-headers-in-system-modules -fimplicit-module-maps -I %S/Inputs/no-undeclared-includes-builtins/libcxx -I %S/Inputs/no-undeclared-includes-builtins/glibc %s // expected-no-diagnostics #include diff --git a/clang/test/Modules/polluted-operator.cppm b/clang/test/Modules/polluted-operator.cppm index b24464aa6ad21e..721ca061c939f4 100644 --- a/clang/test/Modules/polluted-operator.cppm +++ b/clang/test/Modules/polluted-operator.cppm @@ -4,6 +4,12 @@ // // RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/a.cppm -o %t/a.pcm // RUN: %clang_cc1 -std=c++20 %t/b.cppm -fprebuilt-module-path=%t -emit-module-interface -o %t/b.pcm -verify +// +// Testing the behavior of `-fskip-odr-check-in-gmf` +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf -emit-module-interface %t/a.cppm -o \ +// RUN: %t/a.pcm +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/b.cppm -fprebuilt-module-path=%t \ +// RUN: -emit-module-interface -DSKIP_ODR_CHECK_IN_GMF -o %t/b.pcm -verify //--- foo.h @@ -51,7 +57,11 @@ module; export module b; import a; +#ifdef SKIP_ODR_CHECK_IN_GMF +// expected-no-diagnostics +#else // expected-error@* {{has different definitions in different modules; first difference is defined here found data member '_S_copy_ctor' with an initializer}} // expected-note@* {{but in 'a.' found data member '_S_copy_ctor' with a different initializer}} // expected-error@* {{from module 'a.' 
is not present in definition of 'variant<_Types...>' provided earlier}} // expected-note@* {{declaration of 'swap' does not match}} +#endif diff --git a/clang/test/Modules/pr76638.cppm b/clang/test/Modules/pr76638.cppm index 8cc807961421b7..e4820ba3d79d96 100644 --- a/clang/test/Modules/pr76638.cppm +++ b/clang/test/Modules/pr76638.cppm @@ -10,6 +10,12 @@ // RUN: %clang_cc1 -std=c++20 %t/mod4.cppm -fmodule-file=mod3=%t/mod3.pcm \ // RUN: -fsyntax-only -verify +// Testing the behavior of `-fskip-odr-check-in-gmf` +// RUN: %clang_cc1 -std=c++20 %t/mod3.cppm -fskip-odr-check-in-gmf \ +// RUN: -emit-module-interface -o %t/mod3.pcm +// RUN: %clang_cc1 -std=c++20 %t/mod4.cppm -fmodule-file=mod3=%t/mod3.pcm \ +// RUN: -fskip-odr-check-in-gmf -DSKIP_ODR_CHECK_IN_GMF -fsyntax-only -verify + //--- size_t.h extern "C" { @@ -65,5 +71,9 @@ export module mod4; import mod3; export using std::align_val_t; +#ifdef SKIP_ODR_CHECK_IN_GMF +// expected-no-diagnostics +#else // expected-error@align.h:* {{'std::align_val_t' has different definitions in different modules; defined here first difference is enum with specified type 'size_t' (aka 'int')}} // expected-note@align.h:* {{but in 'mod3.' found enum with specified type 'size_t' (aka 'unsigned int')}} +#endif diff --git a/clang/test/Modules/skip-odr-check-in-gmf.cppm b/clang/test/Modules/skip-odr-check-in-gmf.cppm new file mode 100644 index 00000000000000..3ee7d09224bfa2 --- /dev/null +++ b/clang/test/Modules/skip-odr-check-in-gmf.cppm @@ -0,0 +1,56 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// Baseline testing to make sure we can detect the ODR violation from the CC1 invocation. +// RUNX: %clang_cc1 -std=c++20 %t/a.cppm -emit-module-interface -o %t/a.pcm +// RUNX: %clang_cc1 -std=c++20 %t/b.cppm -emit-module-interface -o %t/b.pcm +// RUNX: %clang_cc1 -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -verify +// +// Testing that we can ignore the ODR violation from the driver invocation. 
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm +// RUN: %clang -std=c++20 %t/b.cppm --precompile -o %t/b.pcm +// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify \ +// RUN: -DIGNORE_ODR_VIOLATION +// +// Testing that the driver can require to check the ODR violation. +// RUN: %clang -std=c++20 -Xclang -fno-skip-odr-check-in-gmf %t/a.cppm --precompile -o %t/a.pcm +// RUN: %clang -std=c++20 -Xclang -fno-skip-odr-check-in-gmf %t/b.cppm --precompile -o %t/b.pcm +// RUN: %clang -std=c++20 -Xclang -fno-skip-odr-check-in-gmf %t/test.cc -fprebuilt-module-path=%t \ +// RUN: -fsyntax-only -Xclang -verify + +//--- func1.h +bool func(int x, int y) { + return true; +} + +//--- func2.h +bool func(int x, int y) { + return false; +} + +//--- a.cppm +module; +#include "func1.h" +export module a; +export using ::func; + +//--- b.cppm +module; +#include "func2.h" +export module b; +export using ::func; + +//--- test.cc +import a; +import b; +bool test() { + return func(1, 2); +} + +#ifdef IGNORE_ODR_VIOLATION +// expected-no-diagnostics +#else +// expected-error@func2.h:1 {{'func' has different definitions in different modules;}} +// expected-note@func1.h:1 {{but in 'a.' 
found a different body}} +#endif diff --git a/clang/test/Modules/stddef.c b/clang/test/Modules/stddef.c index 5bc0d1e44c8563..76239826146810 100644 --- a/clang/test/Modules/stddef.c +++ b/clang/test/Modules/stddef.c @@ -1,29 +1,33 @@ // RUN: rm -rf %t -// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fbuiltin-headers-in-system-modules -fmodules-cache-path=%t -I%S/Inputs/StdDef %s -verify -fno-modules-error-recovery +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fbuiltin-headers-in-system-modules -fmodules-cache-path=%t -I%S/Inputs/StdDef %s -verify=builtin-headers-in-system-modules -fno-modules-error-recovery // RUN: rm -rf %t -// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I%S/Inputs/StdDef %s -verify -fno-modules-error-recovery +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I%S/Inputs/StdDef %s -verify=no-builtin-headers-in-system-modules -fno-modules-error-recovery #include "ptrdiff_t.h" ptrdiff_t pdt; -// size_t is declared in both size_t.h and __stddef_size_t.h, both of which are -// modular headers. Regardless of whether stddef.h joins the StdDef test module -// or is in its _Builtin_stddef module, __stddef_size_t.h will be in -// _Builtin_stddef.size_t. It's not defined which module will win as the expected -// provider of size_t. For the purposes of this test it doesn't matter which header -// gets reported, just as long as it isn't other.h or include_again.h. -size_t st; // expected-error-re {{missing '#include "{{size_t|__stddef_size_t}}.h"'; 'size_t' must be declared before it is used}} -// expected-note@size_t.h:* 0+ {{here}} -// expected-note@__stddef_size_t.h:* 0+ {{here}} +// size_t is declared in both size_t.h and __stddef_size_t.h. If +// -fbuiltin-headers-in-system-modules is set, then __stddef_size_t.h is a +// non-modular header that will be transitively pulled in the StdDef test module +// by include_again.h. Otherwise it will be in the _Builtin_stddef module. 
In +// any case it's not defined which module will win as the expected provider of +// size_t. For the purposes of this test it doesn't matter which of the two +// providing headers get reported. +size_t st; // builtin-headers-in-system-modules-error-re {{missing '#include "{{size_t|include_again}}.h"'; 'size_t' must be declared before it is used}} \ + no-builtin-headers-in-system-modules-error-re {{missing '#include "{{size_t|__stddef_size_t}}.h"'; 'size_t' must be declared before it is used}} +// builtin-headers-in-system-modules-note@size_t.h:* 0+ {{here}} \ + no-builtin-headers-in-system-modules-note@size_t.h:* 0+ {{here}} +// builtin-headers-in-system-modules-note@__stddef_size_t.h:* 0+ {{here}} \ + no-builtin-headers-in-system-modules-note@__stddef_size_t.h:* 0+ {{here}} #include "include_again.h" -// Includes which includes <__stddef_size_t.h> which imports the -// _Builtin_stddef.size_t module. +// Includes which includes <__stddef_size_t.h>. size_t st2; #include "size_t.h" -// Redeclares size_t, but the type merger should figure it out. +// Redeclares size_t when -fbuiltin-headers-in-system-modules is not passed, but +// the type merger should figure it out. 
size_t st3; diff --git a/clang/test/OpenMP/bug54082.c b/clang/test/OpenMP/bug54082.c index b88b68fd43012a..337c120983e0a3 100644 --- a/clang/test/OpenMP/bug54082.c +++ b/clang/test/OpenMP/bug54082.c @@ -69,9 +69,7 @@ void foo() { // CHECK-NEXT: [[X_TRAITS:%.*]] = alloca [1 x %struct.omp_alloctrait_t], align 16 // CHECK-NEXT: [[X_ALLOC:%.*]] = alloca i64, align 8 // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[X_TRAITS]]) #[[ATTR5:[0-9]+]] -// CHECK-NEXT: store i32 2, ptr [[X_TRAITS]], align 16 -// CHECK-NEXT: [[LOC0:%.*]] = getelementptr inbounds i8, ptr [[X_TRAITS]], i64 8 -// CHECK-NEXT: store i64 64, ptr [[LOC0]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(16) [[X_TRAITS]], ptr noundef nonnull align 16 dereferenceable(16) @__const.foo.x_traits, i64 16, i1 false) // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[X_ALLOC]]) #[[ATTR5]] // CHECK-NEXT: [[CALL:%.*]] = call i64 @omp_init_allocator(i64 noundef 0, i32 noundef 1, ptr noundef nonnull [[X_TRAITS]]) #[[ATTR5]] // CHECK-NEXT: store i64 [[CALL]], ptr [[X_ALLOC]], align 8, !tbaa [[TBAA3:![0-9]+]] diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index 15879da04fcf0e..1e9aec3fdf2373 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -60,6 +60,10 @@ // CHECK-NOT: __ARM_FEATURE_SVE_BITS 512 // CHECK-NOT: __ARM_FEATURE_SVE_BITS 1024 // CHECK-NOT: __ARM_FEATURE_SVE_BITS 2048 +// CHECK: __ARM_STATE_ZA 1 +// CHECK: __ARM_STATE_ZT0 1 +// CHECK-NOT: __ARM_FEATURE_SME +// CHECK-NOT: __ARM_FEATURE_SME2 // RUN: %clang -target aarch64-none-elf -march=armv8-r -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-R-PROFILE // RUN: %clang -target arm64-none-linux-gnu -march=armv8-r -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-R-PROFILE @@ -314,15 +318,15 @@ // CHECK-MCPU-APPLE-A7: 
"-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8a" "-target-feature" "+aes"{{.*}} "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-APPLE-A10: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8a" "-target-feature" "+aes"{{.*}} "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-APPLE-A11: "-cc1"{{.*}} "-triple" "aarch64{{.*}}"{{.*}}"-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-APPLE-A12: "-cc1"{{.*}} "-triple" "aarch64"{{.*}} "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" +// CHECK-MCPU-APPLE-A12: "-cc1"{{.*}} "-triple" "aarch64"{{.*}} "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-A34: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" 
"+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-APPLE-A13: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "apple-a13" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.4a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sha3" "-target-feature" "+neon" +// CHECK-MCPU-APPLE-A13: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "apple-a13" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.4a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sha3" "-target-feature" "+neon" // CHECK-MCPU-A35: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-A53: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-A57: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-A72: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" 
"-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-CORTEX-A73: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" -// CHECK-MCPU-CORTEX-R82: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8r" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sb" "-target-feature" "+neon" "-target-feature" "+ssbs" +// CHECK-MCPU-CORTEX-R82: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8r" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sb" "-target-feature" "+neon" "-target-feature" "+ssbs" // CHECK-MCPU-M3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-M4: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" // CHECK-MCPU-KRYO: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" 
"-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon" @@ -331,10 +335,10 @@ // CHECK-MCPU-CARMEL: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" // RUN: %clang -target x86_64-apple-macosx -arch arm64 -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH-ARM64 %s -// CHECK-ARCH-ARM64: "-target-cpu" "apple-m1" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.5a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sha3" "-target-feature" "+neon" +// CHECK-ARCH-ARM64: "-target-cpu" "apple-m1" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.5a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sha3" "-target-feature" "+neon" // RUN: %clang -target x86_64-apple-macosx -arch arm64_32 -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH-ARM64_32 %s -// CHECK-ARCH-ARM64_32: "-target-cpu" "apple-s4" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" 
"-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" +// CHECK-ARCH-ARM64_32: "-target-cpu" "apple-s4" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon" // RUN: %clang -target aarch64 -march=armv8-a+fp+simd+crc+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-1 %s // RUN: %clang -target aarch64 -march=armv8-a+nofp+nosimd+nocrc+nocrypto+fp+simd+crc+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-1 %s @@ -497,9 +501,10 @@ // CHECK-MEMTAG: __ARM_FEATURE_MEMORY_TAGGING 1 // ================== Check Pointer Authentication Extension (PAuth). 
-// RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-OFF %s -// RUN: %clang -target arm64-none-linux-gnu -march=armv8.5-a -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-OFF %s -// RUN: %clang -target arm64-none-linux-gnu -march=armv8-a+pauth -mbranch-protection=none -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-ON %s +// RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -x c -E -dM %s -o - | FileCheck -check-prefixes=CHECK-PAUTH-OFF,CHECK-CPU-NOPAUTH %s +// RUN: %clang -target arm64-none-linux-gnu -march=armv8.5-a+nopauth -x c -E -dM %s -o - | FileCheck -check-prefixes=CHECK-PAUTH-OFF,CHECK-CPU-NOPAUTH %s +// RUN: %clang -target arm64-none-linux-gnu -march=armv8.5-a -x c -E -dM %s -o - | FileCheck -check-prefixes=CHECK-PAUTH-OFF,CHECK-CPU-PAUTH %s +// RUN: %clang -target arm64-none-linux-gnu -march=armv8-a+pauth -mbranch-protection=none -x c -E -dM %s -o - | FileCheck -check-prefixes=CHECK-PAUTH-OFF,CHECK-CPU-PAUTH %s // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -mbranch-protection=none -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-OFF %s // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -mbranch-protection=bti -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-OFF %s // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -mbranch-protection=standard -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH %s @@ -507,12 +512,18 @@ // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -mbranch-protection=pac-ret+b-key -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-BKEY %s // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -mbranch-protection=pac-ret+leaf -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-ALL %s // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -mbranch-protection=pac-ret+leaf+b-key -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-BKEY-ALL %s -// 
CHECK-PAUTH-OFF-NOT: __ARM_FEATURE_PAC_DEFAULT -// CHECK-PAUTH: #define __ARM_FEATURE_PAC_DEFAULT 1 -// CHECK-PAUTH-BKEY: #define __ARM_FEATURE_PAC_DEFAULT 2 -// CHECK-PAUTH-ALL: #define __ARM_FEATURE_PAC_DEFAULT 5 -// CHECK-PAUTH-BKEY-ALL: #define __ARM_FEATURE_PAC_DEFAULT 6 -// CHECK-PAUTH-ON: #define __ARM_FEATURE_PAUTH 1 +// +// Note: PAUTH-OFF - pac-ret is disabled +// CPU-NOPAUTH - FEAT_PAUTH support is disabled (but pac-ret can still use HINT-encoded instructions) +// +// CHECK-CPU-NOPAUTH-NOT: __ARM_FEATURE_PAUTH +// CHECK-PAUTH-OFF-NOT: __ARM_FEATURE_PAC_DEFAULT +// CHECK-PAUTH: #define __ARM_FEATURE_PAC_DEFAULT 1 +// CHECK-PAUTH-BKEY: #define __ARM_FEATURE_PAC_DEFAULT 2 +// CHECK-PAUTH-ALL: #define __ARM_FEATURE_PAC_DEFAULT 5 +// CHECK-PAUTH-BKEY-ALL: #define __ARM_FEATURE_PAC_DEFAULT 6 +// CHECK-CPU-PAUTH: #define __ARM_FEATURE_PAUTH 1 +// CHECK-CPU-NOPAUTH-NOT: __ARM_FEATURE_PAUTH // ================== Check Branch Target Identification (BTI). // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-BTI-OFF %s @@ -634,3 +645,12 @@ // RUN: %clang --target=aarch64 -march=armv8.2-a+rcpc3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-RCPC3 %s // CHECK-RCPC3: __ARM_FEATURE_RCPC 3 + +// RUN: %clang --target=aarch64 -march=armv9-a+sme -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME %s +// CHECK-SME: __ARM_FEATURE_LOCALLY_STREAMING 1 +// CHECK-SME: __ARM_FEATURE_SME 1 +// +// RUN: %clang --target=aarch64 -march=armv9-a+sme2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2 %s +// CHECK-SME2: __ARM_FEATURE_LOCALLY_STREAMING 1 +// CHECK-SME2: __ARM_FEATURE_SME 1 +// CHECK-SME2: __ARM_FEATURE_SME2 1 diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index f1f1bbbf66945c..cf96870b27acb3 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -32,6 +32,8 @@ // AARCH64-NEXT: #define __ARM_PCS_AAPCS64 1 // 
AARCH64-NEXT: #define __ARM_SIZEOF_MINIMAL_ENUM 4 // AARCH64-NEXT: #define __ARM_SIZEOF_WCHAR_T 4 +// AARCH64-NEXT: #define __ARM_STATE_ZA 1 +// AARCH64-NEXT: #define __ARM_STATE_ZT0 1 // AARCH64-NEXT: #define __ATOMIC_ACQUIRE 2 // AARCH64-NEXT: #define __ATOMIC_ACQ_REL 4 // AARCH64-NEXT: #define __ATOMIC_CONSUME 1 diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 39d2c66f14b23f..30697af89c2ebe 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -764,6 +764,14 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-ZICNTR-EXT %s // CHECK-ZICNTR-EXT: __riscv_zicntr 2000000{{$}} +// RUN: %clang --target=riscv32 \ +// RUN: -march=rv32i_zicond1p0 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICOND-EXT %s +// RUN: %clang --target=riscv64 \ +// RUN: -march=rv64i_zicond1p0 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICOND-EXT %s +// CHECK-ZICOND-EXT: __riscv_zicond 1000000{{$}} + // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32izicsr2p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZICSR-EXT %s @@ -1332,14 +1340,6 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-ZICFILP-EXT %s // CHECK-ZICFILP-EXT: __riscv_zicfilp 4000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_zicond1p0 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZICOND-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_zicond1p0 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZICOND-EXT %s -// CHECK-ZICOND-EXT: __riscv_zicond 1000000{{$}} - // RUN: %clang --target=riscv32 -menable-experimental-extensions \ // RUN: -march=rv32i_zimop0p1 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZIMOP-EXT %s diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c 
b/clang/test/Sema/aarch64-sme-func-attrs.c index 97409ae7d6040c..2bf1886951f1f7 100644 --- a/clang/test/Sema/aarch64-sme-func-attrs.c +++ b/clang/test/Sema/aarch64-sme-func-attrs.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify=expected-cpp -x c++ %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -fsyntax-only -verify=expected-cpp -x c++ %s // Valid attributes @@ -445,3 +445,12 @@ void conflicting_state_attrs_preserves_out_zt0(void) __arm_preserves("zt0") __ar // expected-cpp-error@+2 {{conflicting attributes for state 'zt0'}} // expected-error@+1 {{conflicting attributes for state 'zt0'}} void conflicting_state_attrs_preserves_inout_zt0(void) __arm_preserves("zt0") __arm_inout("zt0"); + +// Test that we get a diagnostic for unimplemented case. 
+void unimplemented_spill_fill_za(void (*share_zt0_only)(void) __arm_inout("zt0")) __arm_inout("za", "zt0") { + // expected-cpp-error@+4 {{call to a function that shares state other than 'za' from a function that has live 'za' state requires a spill/fill of ZA, which is not yet implemented}} + // expected-cpp-note@+3 {{add '__arm_preserves("za")' to the callee if it preserves ZA}} + // expected-error@+2 {{call to a function that shares state other than 'za' from a function that has live 'za' state requires a spill/fill of ZA, which is not yet implemented}} + // expected-note@+1 {{add '__arm_preserves("za")' to the callee if it preserves ZA}} + share_zt0_only(); +} diff --git a/clang/test/Sema/attr-riscv-rvv-vector-bits.c b/clang/test/Sema/attr-riscv-rvv-vector-bits.c index fe507a102cee1e..60ba2aa034f6e1 100644 --- a/clang/test/Sema/attr-riscv-rvv-vector-bits.c +++ b/clang/test/Sema/attr-riscv-rvv-vector-bits.c @@ -228,8 +228,19 @@ typedef vint8m1_t two_arguments __attribute__((riscv_rvv_vector_bits(2, 4))); // typedef vint8m1_t non_int_size1 __attribute__((riscv_rvv_vector_bits(2.0))); // expected-error {{'riscv_rvv_vector_bits' attribute requires an integer constant}} typedef vint8m1_t non_int_size2 __attribute__((riscv_rvv_vector_bits("256"))); // expected-error {{'riscv_rvv_vector_bits' attribute requires an integer constant}} -// bool types and LMUL != 1 are not supported. 
-typedef vbool1_t fixed_vbool1_t_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); // expected-error {{'riscv_rvv_vector_bits' attribute applied to non-RVV type 'vbool1_t'}} +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vbool2_t fixed_bool2_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 2))); +typedef vbool4_t fixed_bool4_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 4))); +typedef vbool8_t fixed_bool8_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 8))); +#if __riscv_v_fixed_vlen / 16 >= 8 +typedef vbool16_t fixed_bool16_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 16))); +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +typedef vbool32_t fixed_bool32_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 32))); +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +typedef vbool64_t fixed_bool64_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 64))); +#endif // Attribute must be attached to a single RVV vector or predicate type. typedef void *badtype1 __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); // expected-error {{'riscv_rvv_vector_bits' attribute applied to non-RVV type 'void *'}} @@ -242,10 +253,13 @@ vint8m1_t non_typedef_type __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_ // Test that we can define non-local fixed-length RVV types (unsupported for // sizeless types). 
fixed_int8m1_t global_int8; +fixed_bool1_t global_bool1; extern fixed_int8m1_t extern_int8; +extern fixed_bool1_t extern_bool1; static fixed_int8m1_t static_int8; +static fixed_bool1_t static_bool1; fixed_int8m1_t *global_int8_ptr; extern fixed_int8m1_t *extern_int8_ptr; @@ -398,6 +412,20 @@ _Static_assert(sizeof(fixed_int64m8_t) == VECTOR_SIZE * 8, ""); _Static_assert(sizeof(fixed_float32m8_t) == VECTOR_SIZE * 8, ""); _Static_assert(sizeof(fixed_float64m8_t) == VECTOR_SIZE * 8, ""); +_Static_assert(sizeof(fixed_bool1_t) == VECTOR_SIZE, ""); +_Static_assert(sizeof(fixed_bool2_t) == VECTOR_SIZE / 2, ""); +_Static_assert(sizeof(fixed_bool4_t) == VECTOR_SIZE / 4, ""); +_Static_assert(sizeof(fixed_bool8_t) == VECTOR_SIZE / 8, ""); +#if __riscv_v_fixed_vlen / 16 >= 8 +_Static_assert(sizeof(fixed_bool16_t) == VECTOR_SIZE / 16, ""); +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +_Static_assert(sizeof(fixed_bool32_t) == VECTOR_SIZE / 32, ""); +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +_Static_assert(sizeof(fixed_bool64_t) == VECTOR_SIZE / 64, ""); +#endif + // --------------------------------------------------------------------------// // Alignof @@ -475,6 +503,20 @@ _Static_assert(__alignof__(fixed_uint64m8_t) == VECTOR_ALIGN, ""); _Static_assert(__alignof__(fixed_float32m8_t) == VECTOR_ALIGN, ""); _Static_assert(__alignof__(fixed_float64m8_t) == VECTOR_ALIGN, ""); +_Static_assert(__alignof__(fixed_bool1_t) == VECTOR_ALIGN, ""); +_Static_assert(__alignof__(fixed_bool2_t) == (sizeof(fixed_bool2_t) < VECTOR_ALIGN ? sizeof(fixed_bool2_t) : VECTOR_ALIGN), ""); +_Static_assert(__alignof__(fixed_bool4_t) == (sizeof(fixed_bool4_t) < VECTOR_ALIGN ? sizeof(fixed_bool4_t) : VECTOR_ALIGN), ""); +_Static_assert(__alignof__(fixed_bool8_t) == (sizeof(fixed_bool8_t) < VECTOR_ALIGN ? sizeof(fixed_bool8_t) : VECTOR_ALIGN), ""); +#if __riscv_v_fixed_vlen / 16 >= 8 +_Static_assert(__alignof__(fixed_bool16_t) == (sizeof(fixed_bool16_t) < VECTOR_ALIGN ? 
sizeof(fixed_bool16_t) : VECTOR_ALIGN), ""); +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +_Static_assert(__alignof__(fixed_bool32_t) == (sizeof(fixed_bool32_t) < VECTOR_ALIGN ? sizeof(fixed_bool32_t) : VECTOR_ALIGN), ""); +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +_Static_assert(__alignof__(fixed_bool64_t) == (sizeof(fixed_bool64_t) < VECTOR_ALIGN ? sizeof(fixed_bool64_t) : VECTOR_ALIGN), ""); +#endif + // --------------------------------------------------------------------------// // Structs @@ -580,6 +622,26 @@ TEST_CAST_VECTOR(uint64m8) TEST_CAST_VECTOR(float32m8) TEST_CAST_VECTOR(float64m8) +TEST_CAST_COMMON(bool1); +TEST_CAST_COMMON(bool2); +TEST_CAST_COMMON(bool4); +TEST_CAST_COMMON(bool8); +#if __riscv_v_fixed_vlen / 16 >= 8 +TEST_CAST_COMMON(bool16); +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +TEST_CAST_COMMON(bool32); +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +TEST_CAST_COMMON(bool64); +#endif + +// Test conversion between mask and uint8 is invalid, both have the same +// memory representation. 
+fixed_bool1_t to_fixed_bool1_t__from_vuint8m1_t(vuint8m1_t x) { return x; } // expected-error-re {{returning 'vuint8m1_t' (aka '__rvv_uint8m1_t') from a function with incompatible result type 'fixed_bool1_t' (vector of {{[0-9]+}} 'unsigned char' values)}} + +// --------------------------------------------------------------------------// + // --------------------------------------------------------------------------// // Test the scalable and fixed-length types can be used interchangeably @@ -595,6 +657,14 @@ vfloat64m4_t __attribute__((overloadable)) vfunc(vfloat64m4_t op1, vfloat64m4_t vint32m8_t __attribute__((overloadable)) vfunc(vint32m8_t op1, vint32m8_t op2); vfloat64m8_t __attribute__((overloadable)) vfunc(vfloat64m8_t op1, vfloat64m8_t op2); +vbool1_t __attribute__((overloadable)) vfunc(vbool1_t op1, vbool1_t op2); +vbool2_t __attribute__((overloadable)) vfunc(vbool2_t op1, vbool2_t op2); +vbool4_t __attribute__((overloadable)) vfunc(vbool4_t op1, vbool4_t op2); +vbool8_t __attribute__((overloadable)) vfunc(vbool8_t op1, vbool8_t op2); +vbool16_t __attribute__((overloadable)) vfunc(vbool16_t op1, vbool16_t op2); +vbool32_t __attribute__((overloadable)) vfunc(vbool32_t op1, vbool32_t op2); +vbool64_t __attribute__((overloadable)) vfunc(vbool64_t op1, vbool64_t op2); + #define TEST_CALL(TYPE) \ fixed_##TYPE##_t \ call_##TYPE##_ff(fixed_##TYPE##_t op1, fixed_##TYPE##_t op2) { \ @@ -621,6 +691,20 @@ TEST_CALL(float64m4) TEST_CALL(int32m8) TEST_CALL(float64m8) +TEST_CALL(bool1) +TEST_CALL(bool2) +TEST_CALL(bool4) +TEST_CALL(bool8) +#if __riscv_v_fixed_vlen / 16 >= 8 +TEST_CALL(bool16) +#endif +#if __riscv_v_fixed_vlen / 32 >= 8 +TEST_CALL(bool32) +#endif +#if __riscv_v_fixed_vlen / 64 >= 8 +TEST_CALL(bool64) +#endif + // --------------------------------------------------------------------------// // Vector initialization diff --git a/clang/test/Sema/inline-asm-validate-mips.c b/clang/test/Sema/inline-asm-validate-mips.c new file mode 100644 index 
00000000000000..7da248fe417b5c --- /dev/null +++ b/clang/test/Sema/inline-asm-validate-mips.c @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -triple mips64 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple mips64 -target-feature +soft-float -fsyntax-only -verify=softfloat %s + +// expected-no-diagnostics + +void test_f(float p) { + float result = p; + __asm__("" :: "f"(result)); // softfloat-error{{invalid input constraint 'f' in asm}} +} diff --git a/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp b/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp index 8a610fa0e737e1..03a432e05851d1 100644 --- a/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp +++ b/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp @@ -1,13 +1,31 @@ -// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan -triple powerpc64le-unknown-unknown %s \ -// RUN: -menable-no-infs -menable-no-nans +// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \ +// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \ +// RUN: -menable-no-nans -std=c++23 -// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown %s +// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \ +// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \ +// RUN: -menable-no-nans -funsafe-math-optimizations -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \ +// RUN: %s -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-inf -triple powerpc64le-unknown-unknown %s \ +// RUN: -menable-no-infs -std=c++23 // RUN: %clang_cc1 -x c++ -verify=no-inf -triple powerpc64le-unknown-unknown %s \ -// RUN: -menable-no-infs +// RUN: -menable-no-infs -funsafe-math-optimizations -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-nan -triple powerpc64le-unknown-unknown %s \ +// RUN: -menable-no-nans -std=c++23 // RUN: %clang_cc1 -x c++ -verify=no-nan -triple powerpc64le-unknown-unknown %s \ -// RUN: -menable-no-nans +// RUN: -funsafe-math-optimizations -menable-no-nans -std=c++23 + +// RUN: %clang_cc1 
-x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \ +// RUN: %s -Wno-nan-infinity-disabled -menable-no-infs -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \ +// RUN: %s -Wno-nan-infinity-disabled -menable-no-nans -std=c++23 // no-fast-no-diagnostics @@ -133,13 +151,41 @@ int compareit(float a, float b) { // no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} p = __builtin_isfinite(a); - // These should NOT warn, since they are not using NaN or infinity. +// These should NOT warn, since they are not using NaN or infinity. j = a > 1.1; j = b < 1.1; j = a >= 1.1; j = b <= 1.1; j = isunorderedf(a, b); +#ifndef INFINITY + j = a; +#endif +#ifndef NAN + j = b; +#endif +#ifdef INFINITY + j = a; +#endif +#ifdef NAN + j = b; +#endif +#if defined(INFINITY) + j = a; +#elifndef(INFINITY) + j = b; +#endif +#if defined(INFINITY) + j = a; +#elifndef(NAN) + j = b; +#endif +#if defined(NAN) + j = a; +#elifndef(INFINITY) + j = b; +#endif + // no-inf-no-nan-warning@+4 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} // no-inf-no-nan-warning@+3 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} // no-nan-warning@+2 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} @@ -173,4 +219,4 @@ int compareit(float a, float b) { j = numeric_limits::infinity(); return 0; -} +} diff --git a/clang/test/Sema/warn-infinity-nan-disabled-win.cpp b/clang/test/Sema/warn-infinity-nan-disabled-win.cpp index 19a575386e3293..51f9d325619ba0 100644 --- a/clang/test/Sema/warn-infinity-nan-disabled-win.cpp +++ b/clang/test/Sema/warn-infinity-nan-disabled-win.cpp @@ -1,16 +1,34 @@ // Use of NAN macro will trigger a warning "infinity defined in macro" because // on Windows the NAN macro is defined using INFINITY. See below. 
-// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan -triple powerpc64le-unknown-unknown %s \ -// RUN: -menable-no-infs -menable-no-nans +// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \ +// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \ +// RUN: -menable-no-nans -std=c++23 -// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown %s +// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \ +// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \ +// RUN: -menable-no-nans -funsafe-math-optimizations -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \ +// RUN: %s -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-inf -triple powerpc64le-unknown-unknown %s \ +// RUN: -menable-no-infs -std=c++23 // RUN: %clang_cc1 -x c++ -verify=no-inf -triple powerpc64le-unknown-unknown %s \ -// RUN: -menable-no-infs +// RUN: -menable-no-infs -funsafe-math-optimizations -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-nan -triple powerpc64le-unknown-unknown %s \ +// RUN: -menable-no-nans -std=c++23 // RUN: %clang_cc1 -x c++ -verify=no-nan -triple powerpc64le-unknown-unknown %s \ -// RUN: -menable-no-nans +// RUN: -funsafe-math-optimizations -menable-no-nans -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \ +// RUN: %s -Wno-nan-infinity-disabled -menable-no-infs -std=c++23 + +// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \ +// RUN: %s -Wno-nan-infinity-disabled -menable-no-nans -std=c++23 // no-fast-no-diagnostics @@ -136,13 +154,41 @@ int compareit(float a, float b) { // no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} p = __builtin_isfinite(a); - // These should NOT warn, since they are not using NaN or infinity. +// These should NOT warn, since they are not using NaN or infinity. 
j = a > 1.1; j = b < 1.1; j = a >= 1.1; j = b <= 1.1; j = isunorderedf(a, b); +#ifndef INFINITY + j = a; +#endif +#ifndef NAN + j = b; +#endif +#ifdef INFINITY + j = a; +#endif +#ifdef NAN + j = b; +#endif +#if defined(INFINITY) + j = a; +#elifndef(INFINITY) + j = b; +#endif +#if defined(INFINITY) + j = a; +#elifndef(NAN) + j = b; +#endif +#if defined(NAN) + j = a; +#elifndef(INFINITY) + j = b; +#endif + // no-inf-no-nan-warning@+4 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point option}} // no-inf-no-nan-warning@+3 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} // no-inf-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} @@ -176,4 +222,4 @@ int compareit(float a, float b) { j = numeric_limits::infinity(); return 0; -} +} diff --git a/clang/test/Sema/warn-int-in-bool-context.c b/clang/test/Sema/warn-int-in-bool-context.c index 0c94ebb391f3c5..99f3db9f8d41a7 100644 --- a/clang/test/Sema/warn-int-in-bool-context.c +++ b/clang/test/Sema/warn-int-in-bool-context.c @@ -72,3 +72,14 @@ int test(int a, unsigned b, enum num n) { // Don't warn in macros. 
return SHIFT(1, a); } + +int GH64356(int arg) { + if ((arg == 1) && (1 == 1)) return 1; + return 0; + + if ((64 > 32) && (32 < 64)) + return 2; + + if ((1 == 1) && (arg == 1)) return 1; + return 0; +} diff --git a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp index 531a6262287335..4a75392045d05a 100644 --- a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp +++ b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp @@ -368,3 +368,29 @@ vector v{}; // expected-note@-2 {{in call to 'vector()'}} } + + +namespace GH82258 { + +template +constexpr auto none_of(R&& r, Pred pred) -> bool { return true; } + +struct info { int value; }; +consteval auto is_invalid(info i) -> bool { return false; } +constexpr info types[] = { {1}, {3}, {5}}; + +static_assert(none_of( + types, + +[](info i) consteval { + return is_invalid(i); + } +)); + +static_assert(none_of( + types, + []{ + return is_invalid; + }() +)); + +} diff --git a/clang/test/SemaCXX/cxx2b-static-operator.cpp b/clang/test/SemaCXX/cxx2b-static-operator.cpp new file mode 100644 index 00000000000000..4d6f1f76d13157 --- /dev/null +++ b/clang/test/SemaCXX/cxx2b-static-operator.cpp @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++23 %s + +// expected-no-diagnostics + +namespace A { + +struct Foo { + static int operator()(int a, int b) { return a + b; } + static int operator[](int a, int b) { return a + b; } +}; + +void ok() { + // Should pass regardless of const / volatile + Foo foo; + foo(1, 2); + foo[1, 2]; + + const Foo fooC; + fooC(1, 2); + fooC[1, 2]; + + const Foo fooV; + fooV(1, 2); + fooV[1, 2]; + + const volatile Foo fooCV; + fooCV(1, 2); + fooCV[1, 2]; +} + +} diff --git a/clang/test/SemaCXX/gh53815.cpp b/clang/test/SemaCXX/gh53815.cpp new file mode 100644 index 00000000000000..326c911c7bfaf5 --- /dev/null +++ b/clang/test/SemaCXX/gh53815.cpp @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++20 %s +// expected-no-diagnostics + +// 
Check that we don't crash due to forgetting to check for placeholders +// in the RHS of '.*'. + +template +static bool has_explicitly_named_overload() { + return requires { Fn().*&Fn::operator(); }; +} + +int main() { + has_explicitly_named_overload(); +} + +template +constexpr bool has_explicitly_named_overload_2() { + return requires { Fn().*&Fn::operator(); }; +} + +static_assert(!has_explicitly_named_overload_2()); diff --git a/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp b/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp index bda6a65c02168b..d54b394df4eb84 100644 --- a/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp +++ b/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -std=c++14 -verify -fsyntax-only -Wshadow -D AVOID %s -// RUN: %clang_cc1 -std=c++14 -verify -fsyntax-only -Wshadow -Wshadow-uncaptured-local %s -// RUN: %clang_cc1 -std=c++14 -verify -fsyntax-only -Wshadow-all %s +// RUN: %clang_cc1 -std=c++14 -verify=expected,cxx14 -fsyntax-only -Wshadow -D AVOID %s +// RUN: %clang_cc1 -std=c++14 -verify=expected,cxx14 -fsyntax-only -Wshadow -Wshadow-uncaptured-local %s +// RUN: %clang_cc1 -std=c++14 -verify=expected,cxx14 -fsyntax-only -Wshadow-all %s // RUN: %clang_cc1 -std=c++17 -verify -fsyntax-only -Wshadow-all %s // RUN: %clang_cc1 -std=c++20 -verify -fsyntax-only -Wshadow-all %s @@ -179,3 +179,89 @@ void f() { #endif } } + +namespace GH71976 { +#ifdef AVOID +struct A { + int b = 5; + int foo() { + return [b = b]() { return b; }(); // no -Wshadow diagnostic, init-capture does not shadow b due to not capturing this + } +}; + +struct B { + int a; + void foo() { + auto b = [a = this->a] {}; // no -Wshadow diagnostic, init-capture does not shadow a due to not capturing his + } +}; + +struct C { + int b = 5; + int foo() { + return [a = b]() { + return [=, b = a]() { // no -Wshadow diagnostic, init-capture does not shadow b due to outer lambda + return b; + }(); + }(); + } +}; + +#else +struct A { + int b = 5; // expected-note 
{{previous}} + int foo() { + return [b = b]() { return b; }(); // expected-warning {{declaration shadows a field}} + } +}; + +struct B { + int a; // expected-note {{previous}} + void foo() { + auto b = [a = this->a] {}; // expected-warning {{declaration shadows a field}} + } +}; + +struct C { + int b = 5; // expected-note {{previous}} + int foo() { + return [a = b]() { + return [=, b = a]() { // expected-warning {{declaration shadows a field}} + return b; + }(); + }(); + } +}; + +struct D { + int b = 5; // expected-note {{previous}} + int foo() { + return [b = b, this]() { return b; }(); // expected-warning {{declaration shadows a field}} + } +}; + +struct E { + int b = 5; + int foo() { + return [a = b]() { // expected-note {{previous}} + return [=, a = a]() { // expected-warning {{shadows a local}} + return a; + }(); + }(); + } +}; + +#endif + +struct S { + int a ; +}; + +int foo() { + auto [a] = S{0}; // expected-note {{previous}} \ + // cxx14-warning {{decomposition declarations are a C++17 extension}} + [a = a] () { // expected-warning {{declaration shadows a structured binding}} + }(); +} + +} diff --git a/clang/test/SemaTemplate/concepts-friends.cpp b/clang/test/SemaTemplate/concepts-friends.cpp index 255b0858917fb6..0b008811f13621 100644 --- a/clang/test/SemaTemplate/concepts-friends.cpp +++ b/clang/test/SemaTemplate/concepts-friends.cpp @@ -478,3 +478,29 @@ template class Foo { }; } // namespace FriendOfFriend + +namespace GH86769 { + +template +concept X = true; + +template struct Y { + Y(T) {} + template friend struct Y; + template friend struct Y; + template friend struct Y; +}; + +template +struct Z { + // FIXME: This is ill-formed per C++11 [temp.param]p12: + // A default template argument shall not be specified in a friend class + // template declaration. 
+ template friend struct Y; +}; + +template struct Y; +template struct Z; +Y y(1); + +} diff --git a/clang/test/SemaTemplate/concepts-lambda.cpp b/clang/test/SemaTemplate/concepts-lambda.cpp index 7e431529427dff..0b7580f91043c7 100644 --- a/clang/test/SemaTemplate/concepts-lambda.cpp +++ b/clang/test/SemaTemplate/concepts-lambda.cpp @@ -149,3 +149,21 @@ void foo() { auto caller = make_caller.operator()<&S1::f1>(); } } // namespace ReturnTypeRequirementInLambda + +namespace GH73418 { +void foo() { + int x; + [&x](auto) { + return [](auto y) { + return [](auto obj, auto... params) + requires requires { + sizeof...(params); + [](auto... pack) { + return sizeof...(pack); + }(params...); + } + { return false; }(y); + }(x); + }(x); +} +} // namespace GH73418 diff --git a/clang/test/SemaTemplate/ctad.cpp b/clang/test/SemaTemplate/ctad.cpp index 388ed7d4cced18..ec144d4f44ba8c 100644 --- a/clang/test/SemaTemplate/ctad.cpp +++ b/clang/test/SemaTemplate/ctad.cpp @@ -53,4 +53,4 @@ X x; template struct Y { Y(T); }; template struct Y ; Y y(1); -}; +} diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp index b5b8cadc909ce0..ad73daa8e214c3 100644 --- a/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp +++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp @@ -336,3 +336,38 @@ template void bar(B b) { (b.operator Tbar(), ...); } } + +namespace ReportedRegression1 { + const char kt[] = "dummy"; + + template + class SomeTempl { }; + + template + class SomeTempl { + public: + int exit_code() const { return 0; } + }; + + int use() { + SomeTempl dummy; + return dummy.exit_code(); + } +} + +namespace ReportedRegression2 { + const char str[] = "dummy"; + + struct S { + S operator+(const char*) const; + }; + + template + void fn() { + auto s = S{} + in; + } + + void use() { + fn(); + } +} diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp index 5ee6092bb9bb7f..e122cea50f7268 
100644 --- a/clang/tools/clang-format/ClangFormat.cpp +++ b/clang/tools/clang-format/ClangFormat.cpp @@ -399,7 +399,8 @@ class ClangFormatDiagConsumer : public DiagnosticConsumer { }; // Returns true on error. -static bool format(StringRef FileName, bool IsSTDIN) { +static bool format(StringRef FileName) { + const bool IsSTDIN = FileName == "-"; if (!OutputXML && Inplace && IsSTDIN) { errs() << "error: cannot use -i when reading from stdin.\n"; return false; @@ -545,24 +546,25 @@ static void PrintVersion(raw_ostream &OS) { } // Dump the configuration. -static int dumpConfig(bool IsSTDIN) { +static int dumpConfig() { std::unique_ptr Code; - - // `FileNames` must have at least "-" in it even if no file was specified. - assert(!FileNames.empty()); - - // Read in the code in case the filename alone isn't enough to detect the - // language. - ErrorOr> CodeOrErr = - MemoryBuffer::getFileOrSTDIN(FileNames[0]); - if (std::error_code EC = CodeOrErr.getError()) { - llvm::errs() << EC.message() << "\n"; - return 1; + // We can't read the code to detect the language if there's no file name. + if (!FileNames.empty()) { + // Read in the code in case the filename alone isn't enough to detect the + // language. + ErrorOr> CodeOrErr = + MemoryBuffer::getFileOrSTDIN(FileNames[0]); + if (std::error_code EC = CodeOrErr.getError()) { + llvm::errs() << EC.message() << "\n"; + return 1; + } + Code = std::move(CodeOrErr.get()); } - Code = std::move(CodeOrErr.get()); - llvm::Expected FormatStyle = - clang::format::getStyle(Style, IsSTDIN ? AssumeFileName : FileNames[0], + clang::format::getStyle(Style, + FileNames.empty() || FileNames[0] == "-" + ? AssumeFileName + : FileNames[0], FallbackStyle, Code ? 
Code->getBuffer() : ""); if (!FormatStyle) { llvm::errs() << llvm::toString(FormatStyle.takeError()) << "\n"; @@ -682,11 +684,8 @@ int main(int argc, const char **argv) { return 0; } - if (FileNames.empty()) - FileNames.push_back("-"); - if (DumpConfig) - return dumpConfig(FileNames[0] == "-"); + return dumpConfig(); if (!Files.empty()) { std::ifstream ExternalFileOfFiles{std::string(Files)}; @@ -699,7 +698,10 @@ int main(int argc, const char **argv) { errs() << "Clang-formating " << LineNo << " files\n"; } - if (FileNames.size() != 1 && + if (FileNames.empty()) + return clang::format::format("-"); + + if (FileNames.size() > 1 && (!Offsets.empty() || !Lengths.empty() || !LineRanges.empty())) { errs() << "error: -offset, -length and -lines can only be used for " "single file.\n"; @@ -709,14 +711,13 @@ int main(int argc, const char **argv) { unsigned FileNo = 1; bool Error = false; for (const auto &FileName : FileNames) { - const bool IsSTDIN = FileName == "-"; - if (!IsSTDIN && isIgnored(FileName)) + if (isIgnored(FileName)) continue; if (Verbose) { errs() << "Formatting [" << FileNo++ << "/" << FileNames.size() << "] " << FileName << "\n"; } - Error |= clang::format::format(FileName, IsSTDIN); + Error |= clang::format::format(FileName); } return Error ? 
1 : 0; } diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 2a8d79359a49b4..6436581ddae5ae 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -231,6 +231,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { AfterFunctionDefinitionName); CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterIfMacros); CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterOverloadedOperator); + CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterPlacementOperator); CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, BeforeNonEmptyParentheses); CHECK_PARSE_NESTED_BOOL(SpacesInParensOptions, InCStyleCasts); CHECK_PARSE_NESTED_BOOL(SpacesInParensOptions, InConditionalStatements); @@ -609,24 +610,6 @@ TEST(ConfigParseTest, ParsesConfiguration) { SpaceBeforeParens, FormatStyle::SBPO_ControlStatementsExceptControlMacros); - Style.SpaceBeforeParens = FormatStyle::SBPO_Custom; - Style.SpaceBeforeParensOptions.AfterPlacementOperator = - FormatStyle::SpaceBeforeParensCustom::APO_Always; - CHECK_PARSE("SpaceBeforeParensOptions:\n" - " AfterPlacementOperator: Never", - SpaceBeforeParensOptions.AfterPlacementOperator, - FormatStyle::SpaceBeforeParensCustom::APO_Never); - - CHECK_PARSE("SpaceBeforeParensOptions:\n" - " AfterPlacementOperator: Always", - SpaceBeforeParensOptions.AfterPlacementOperator, - FormatStyle::SpaceBeforeParensCustom::APO_Always); - - CHECK_PARSE("SpaceBeforeParensOptions:\n" - " AfterPlacementOperator: Leave", - SpaceBeforeParensOptions.AfterPlacementOperator, - FormatStyle::SpaceBeforeParensCustom::APO_Leave); - // For backward compatibility: Style.SpacesInParens = FormatStyle::SIPO_Never; Style.SpacesInParensOptions = {}; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index e5e763edf5b5bf..88877e53d014c6 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -1865,6 
+1865,13 @@ TEST_F(FormatTest, UnderstandsMacros) { verifyFormat("MACRO(co_return##something)"); verifyFormat("#define A x:"); + + verifyFormat("#define Foo(Bar) {#Bar}", "#define Foo(Bar) \\\n" + " { \\\n" + " #Bar \\\n" + " }"); + verifyFormat("#define Foo(Bar) {#Bar}", "#define Foo(Bar) \\\n" + " { #Bar }"); } TEST_F(FormatTest, ShortBlocksInMacrosDontMergeWithCodeAfterMacro) { @@ -10865,7 +10872,7 @@ TEST_F(FormatTest, UnderstandsTemplateParameters) { verifyFormat("some_templated_type"); verifyFormat("#define FOO(typeName, realClass) \\\n" - " { #typeName, foo(new foo(#typeName)) }", + " {#typeName, foo(new foo(#typeName))}", getLLVMStyleWithColumns(60)); } @@ -11347,35 +11354,31 @@ TEST_F(FormatTest, UnderstandsNewAndDelete) { FormatStyle AfterPlacementOperator = getLLVMStyle(); AfterPlacementOperator.SpaceBeforeParens = FormatStyle::SBPO_Custom; - EXPECT_EQ( - AfterPlacementOperator.SpaceBeforeParensOptions.AfterPlacementOperator, - FormatStyle::SpaceBeforeParensCustom::APO_Leave); + EXPECT_TRUE( + AfterPlacementOperator.SpaceBeforeParensOptions.AfterPlacementOperator); verifyFormat("new (buf) int;", AfterPlacementOperator); - verifyFormat("new(buf) int;", AfterPlacementOperator); - - AfterPlacementOperator.SpaceBeforeParensOptions.AfterPlacementOperator = - FormatStyle::SpaceBeforeParensCustom::APO_Never; verifyFormat("struct A {\n" " int *a;\n" - " A(int *p) : a(new(p) int) {\n" - " new(p) int;\n" - " int *b = new(p) int;\n" - " int *c = new(p) int(3);\n" - " delete(b);\n" + " A(int *p) : a(new (p) int) {\n" + " new (p) int;\n" + " int *b = new (p) int;\n" + " int *c = new (p) int(3);\n" + " delete (b);\n" " }\n" "};", AfterPlacementOperator); verifyFormat("void operator new(void *foo) ATTRIB;", AfterPlacementOperator); AfterPlacementOperator.SpaceBeforeParensOptions.AfterPlacementOperator = - FormatStyle::SpaceBeforeParensCustom::APO_Always; + false; + verifyFormat("new(buf) int;", AfterPlacementOperator); verifyFormat("struct A {\n" " int *a;\n" - " A(int 
*p) : a(new (p) int) {\n" - " new (p) int;\n" - " int *b = new (p) int;\n" - " int *c = new (p) int(3);\n" - " delete (b);\n" + " A(int *p) : a(new(p) int) {\n" + " new(p) int;\n" + " int *b = new(p) int;\n" + " int *c = new(p) int(3);\n" + " delete(b);\n" " }\n" "};", AfterPlacementOperator); @@ -26860,6 +26863,7 @@ TEST_F(FormatTest, RemoveParentheses) { EXPECT_EQ(Style.RemoveParentheses, FormatStyle::RPS_Leave); Style.RemoveParentheses = FormatStyle::RPS_MultipleParentheses; + verifyFormat("#define Foo(...) foo((__VA_ARGS__))", Style); verifyFormat("int x __attribute__((aligned(16))) = 0;", Style); verifyFormat("decltype((foo->bar)) baz;", Style); verifyFormat("class __declspec(dllimport) X {};", @@ -26894,6 +26898,7 @@ TEST_F(FormatTest, RemoveParentheses) { verifyFormat("return (({ 0; }));", "return ((({ 0; })));", Style); Style.RemoveParentheses = FormatStyle::RPS_ReturnStatement; + verifyFormat("#define Return0 return (0);", Style); verifyFormat("return 0;", "return (0);", Style); verifyFormat("co_return 0;", "co_return ((0));", Style); verifyFormat("return 0;", "return (((0)));", Style); @@ -27021,10 +27026,16 @@ TEST_F(FormatTest, PPBranchesInBracedInit) { "};"); } -TEST_F(FormatTest, StreamOutputOperator) { - verifyFormat("std::cout << \"foo\" << \"bar\" << baz;"); - verifyFormat("std::cout << \"foo\\n\"\n" - " << \"bar\";"); +TEST_F(FormatTest, PPDirectivesAndCommentsInBracedInit) { + verifyFormat("{\n" + " char *a[] = {\n" + " /* abc */ \"abc\",\n" + "#if FOO\n" + " /* xyz */ \"xyz\",\n" + "#endif\n" + " /* last */ \"last\"};\n" + "}", + getLLVMStyleWithColumns(30)); } TEST_F(FormatTest, BreakAdjacentStringLiterals) { @@ -27041,6 +27052,7 @@ TEST_F(FormatTest, BreakAdjacentStringLiterals) { Style.BreakAdjacentStringLiterals = false; verifyFormat(Code, Style); } + } // namespace } // namespace test } // namespace format diff --git a/clang/unittests/Format/FormatTestComments.cpp b/clang/unittests/Format/FormatTestComments.cpp index 
c249f4d9333fd0..d7c432ed031d34 100644 --- a/clang/unittests/Format/FormatTestComments.cpp +++ b/clang/unittests/Format/FormatTestComments.cpp @@ -376,6 +376,10 @@ TEST_F(FormatTestComments, RemovesTrailingWhitespaceOfComments) { TEST_F(FormatTestComments, UnderstandsBlockComments) { verifyFormat("f(/*noSpaceAfterParameterNamingComment=*/true);"); verifyFormat("void f() { g(/*aaa=*/x, /*bbb=*/!y, /*c=*/::c); }"); + verifyFormat("fooooooooooooooooooooooooooooo(\n" + " /*qq_=*/move(q), [this, b](bar b) {},\n" + " c);", + getLLVMStyleWithColumns(60)); EXPECT_EQ("f(aaaaaaaaaaaaaaaaaaaaaaaaa, /* Trailing comment for aa... */\n" " bbbbbbbbbbbbbbbbbbbbbbbbb);", format("f(aaaaaaaaaaaaaaaaaaaaaaaaa , \\\n" diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 3dbf504c35ed55..44ebad9d5a872a 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -611,6 +611,11 @@ TEST_F(TokenAnnotatorTest, UnderstandsCasts) { EXPECT_TOKEN(Tokens[13], tok::r_paren, TT_Unknown); EXPECT_TOKEN(Tokens[14], tok::star, TT_BinaryOperator); + Tokens = annotate("#define foo(i) ((i) - bar)"); + ASSERT_EQ(Tokens.size(), 14u) << Tokens; + EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_Unknown); + EXPECT_TOKEN(Tokens[10], tok::minus, TT_BinaryOperator); + Tokens = annotate("return (Foo) & 10;"); ASSERT_EQ(Tokens.size(), 8u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::r_paren, TT_Unknown); @@ -1867,6 +1872,10 @@ TEST_F(TokenAnnotatorTest, UnderstandsTrailingReturnArrow) { ASSERT_EQ(Tokens.size(), 12u) << Tokens; EXPECT_TOKEN(Tokens[7], tok::arrow, TT_Unknown); + Tokens = annotate("__attribute__((cold)) C() : Base(obj->func()) {}"); + ASSERT_EQ(Tokens.size(), 21u) << Tokens; + EXPECT_TOKEN(Tokens[13], tok::arrow, TT_Unknown); + // Mixed Tokens = annotate("auto f() -> int { auto a = b()->c; }"); ASSERT_EQ(Tokens.size(), 18u) << Tokens; @@ -1880,14 +1889,20 @@ TEST_F(TokenAnnotatorTest, 
UnderstandHashInMacro) { " #Bar \\\n" " }"); ASSERT_EQ(Tokens.size(), 11u) << Tokens; - EXPECT_BRACE_KIND(Tokens[6], BK_Block); - EXPECT_BRACE_KIND(Tokens[9], BK_Block); + EXPECT_BRACE_KIND(Tokens[6], BK_BracedInit); + EXPECT_BRACE_KIND(Tokens[9], BK_BracedInit); Tokens = annotate("#define Foo(Bar) \\\n" " { #Bar }"); ASSERT_EQ(Tokens.size(), 11u) << Tokens; - EXPECT_BRACE_KIND(Tokens[6], BK_Block); - EXPECT_BRACE_KIND(Tokens[9], BK_Block); + EXPECT_BRACE_KIND(Tokens[6], BK_BracedInit); + EXPECT_BRACE_KIND(Tokens[9], BK_BracedInit); + + Tokens = annotate("#define FOO(typeName, realClass) \\\n" + " {#typeName, foo(new foo(#typeName))}"); + ASSERT_EQ(Tokens.size(), 29u) << Tokens; + EXPECT_BRACE_KIND(Tokens[8], BK_BracedInit); + EXPECT_BRACE_KIND(Tokens[27], BK_BracedInit); } TEST_F(TokenAnnotatorTest, UnderstandsAttributeMacros) { @@ -2590,15 +2605,33 @@ TEST_F(TokenAnnotatorTest, BraceKind) { EXPECT_TOKEN(Tokens[4], tok::l_brace, TT_FunctionLBrace); EXPECT_BRACE_KIND(Tokens[4], BK_Block); EXPECT_BRACE_KIND(Tokens[6], BK_Block); -} -TEST_F(TokenAnnotatorTest, StreamOperator) { - auto Tokens = annotate("\"foo\\n\" << aux << \"foo\\n\" << \"foo\";"); - ASSERT_EQ(Tokens.size(), 9u) << Tokens; - EXPECT_FALSE(Tokens[1]->MustBreakBefore); - EXPECT_FALSE(Tokens[3]->MustBreakBefore); - // Only break between string literals if the former ends with \n. 
- EXPECT_TRUE(Tokens[5]->MustBreakBefore); + Tokens = annotate("struct Foo {\n" + " Foo() {};\n" + " ~Foo() {};\n" + "};"); + ASSERT_EQ(Tokens.size(), 19u) << Tokens; + EXPECT_TOKEN(Tokens[3], tok::identifier, TT_CtorDtorDeclName); + EXPECT_TOKEN(Tokens[6], tok::l_brace, TT_FunctionLBrace); + EXPECT_BRACE_KIND(Tokens[6], BK_Block); + EXPECT_BRACE_KIND(Tokens[7], BK_Block); + EXPECT_TOKEN(Tokens[10], tok::identifier, TT_CtorDtorDeclName); + EXPECT_TOKEN(Tokens[13], tok::l_brace, TT_FunctionLBrace); + EXPECT_BRACE_KIND(Tokens[13], BK_Block); + EXPECT_BRACE_KIND(Tokens[14], BK_Block); + + Tokens = annotate("{\n" + " char *a[] = {\n" + " /* abc */ \"abc\",\n" + "#if FOO\n" + " /* xyz */ \"xyz\",\n" + "#endif\n" + " /* last */ \"last\"};\n" + "}"); + ASSERT_EQ(Tokens.size(), 25u) << Tokens; + EXPECT_BRACE_KIND(Tokens[0], BK_Block); + EXPECT_BRACE_KIND(Tokens[7], BK_BracedInit); + EXPECT_BRACE_KIND(Tokens[21], BK_BracedInit); } } // namespace diff --git a/clang/unittests/Serialization/CMakeLists.txt b/clang/unittests/Serialization/CMakeLists.txt index 10d7de970c643d..e7eebd0cb98239 100644 --- a/clang/unittests/Serialization/CMakeLists.txt +++ b/clang/unittests/Serialization/CMakeLists.txt @@ -10,6 +10,7 @@ add_clang_unittest(SerializationTests InMemoryModuleCacheTest.cpp ModuleCacheTest.cpp NoCommentsTest.cpp + PreambleInNamedModulesTest.cpp SourceLocationEncodingTest.cpp VarDeclConstantInitTest.cpp ) diff --git a/clang/unittests/Serialization/PreambleInNamedModulesTest.cpp b/clang/unittests/Serialization/PreambleInNamedModulesTest.cpp new file mode 100644 index 00000000000000..d26e1cb633654f --- /dev/null +++ b/clang/unittests/Serialization/PreambleInNamedModulesTest.cpp @@ -0,0 +1,132 @@ +//===- unittests/Serialization/PreambleInNamedModulesTest.cpp -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Frontend/CompilerInstance.h" +#include "clang/Frontend/CompilerInvocation.h" +#include "clang/Frontend/FrontendActions.h" +#include "clang/Frontend/PrecompiledPreamble.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" + +#include "gtest/gtest.h" + +using namespace llvm; +using namespace clang; + +namespace { + +class PreambleInNamedModulesTest : public ::testing::Test { + void SetUp() override { + ASSERT_FALSE(sys::fs::createUniqueDirectory("modules-test", TestDir)); + } + + void TearDown() override { sys::fs::remove_directories(TestDir); } + +public: + using PathType = SmallString<256>; + + PathType TestDir; + + void addFile(StringRef Path, StringRef Contents, PathType &AbsPath) { + ASSERT_FALSE(sys::path::is_absolute(Path)); + + AbsPath = TestDir; + sys::path::append(AbsPath, Path); + + ASSERT_FALSE( + sys::fs::create_directories(llvm::sys::path::parent_path(AbsPath))); + + std::error_code EC; + llvm::raw_fd_ostream OS(AbsPath, EC); + ASSERT_FALSE(EC); + OS << Contents; + } + + void addFile(StringRef Path, StringRef Contents) { + PathType UnusedAbsPath; + addFile(Path, Contents, UnusedAbsPath); + } +}; + +// Testing that the use of Preamble in named modules can work basically. 
+// See https://github.com/llvm/llvm-project/issues/80570 +TEST_F(PreambleInNamedModulesTest, BasicTest) { + addFile("foo.h", R"cpp( +enum class E { + A, + B, + C, + D +}; + )cpp"); + + PathType MainFilePath; + addFile("A.cppm", R"cpp( +module; +#include "foo.h" +export module A; +export using ::E; + )cpp", + MainFilePath); + + IntrusiveRefCntPtr Diags = + CompilerInstance::createDiagnostics(new DiagnosticOptions()); + IntrusiveRefCntPtr VFS = + llvm::vfs::createPhysicalFileSystem(); + + CreateInvocationOptions CIOpts; + CIOpts.Diags = Diags; + CIOpts.VFS = VFS; + + const char *Args[] = {"clang++", "-std=c++20", "-working-directory", + TestDir.c_str(), MainFilePath.c_str()}; + std::shared_ptr Invocation = + createInvocation(Args, CIOpts); + ASSERT_TRUE(Invocation); + + llvm::ErrorOr> ContentsBuffer = + llvm::MemoryBuffer::getFile(MainFilePath, /*IsText=*/true); + EXPECT_TRUE(ContentsBuffer); + std::unique_ptr Buffer = std::move(*ContentsBuffer); + + PreambleBounds Bounds = + ComputePreambleBounds(Invocation->getLangOpts(), *Buffer, 0); + + PreambleCallbacks Callbacks; + llvm::ErrorOr BuiltPreamble = PrecompiledPreamble::Build( + *Invocation, Buffer.get(), Bounds, *Diags, VFS, + std::make_shared(), + /*StoreInMemory=*/false, /*StoragePath=*/TestDir, Callbacks); + + ASSERT_FALSE(Diags->hasErrorOccurred()); + + EXPECT_TRUE(BuiltPreamble); + EXPECT_TRUE(BuiltPreamble->CanReuse(*Invocation, *Buffer, Bounds, *VFS)); + BuiltPreamble->OverridePreamble(*Invocation, VFS, Buffer.get()); + + auto Clang = std::make_unique( + std::make_shared()); + Clang->setInvocation(std::move(Invocation)); + Clang->setDiagnostics(Diags.get()); + + if (auto VFSWithRemapping = createVFSFromCompilerInvocation( + Clang->getInvocation(), Clang->getDiagnostics(), VFS)) + VFS = VFSWithRemapping; + + Clang->createFileManager(VFS); + EXPECT_TRUE(Clang->createTarget()); + + Buffer.release(); + + SyntaxOnlyAction Action; + EXPECT_TRUE(Clang->ExecuteAction(Action)); + 
EXPECT_FALSE(Clang->getDiagnosticsPtr()->hasErrorOccurred()); +} + +} // namespace diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt index c6d51863fb1b5c..93744f46060236 100644 --- a/clang/utils/perf-training/CMakeLists.txt +++ b/clang/utils/perf-training/CMakeLists.txt @@ -1,6 +1,10 @@ +include(LLVMExternalProjectUtils) + set(CLANG_PGO_TRAINING_DATA "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH "The path to a lit testsuite containing samples for PGO and order file generation" ) +set(CLANG_PGO_TRAINING_DATA_SOURCE_DIR OFF CACHE STRING "Path to source directory containing cmake project with source files to use for generating pgo data") +set(CLANG_PGO_TRAINING_DEPS "" CACHE STRING "Extra dependencies needed to build the PGO training data.") if(LLVM_BUILD_INSTRUMENTED) configure_lit_site_cfg( @@ -11,11 +15,11 @@ if(LLVM_BUILD_INSTRUMENTED) add_lit_testsuite(generate-profraw "Generating clang PGO data" ${CMAKE_CURRENT_BINARY_DIR}/pgo-data/ EXCLUDE_FROM_CHECK_ALL - DEPENDS clang clear-profraw ${CLANG_PERF_TRAINING_DEPS} + DEPENDS clang clear-profraw ${CLANG_PGO_TRAINING_DEPS} ) add_custom_target(clear-profraw - COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} profraw + COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_BINARY_DIR}/profiles/ profraw COMMENT "Clearing old profraw data") if(NOT LLVM_PROFDATA) @@ -26,9 +30,14 @@ if(LLVM_BUILD_INSTRUMENTED) message(STATUS "To enable merging PGO data LLVM_PROFDATA has to point to llvm-profdata") else() add_custom_target(generate-profdata - COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge ${LLVM_PROFDATA} ${CMAKE_CURRENT_BINARY_DIR}/clang.profdata ${CMAKE_CURRENT_BINARY_DIR} + COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge ${LLVM_PROFDATA} ${CMAKE_CURRENT_BINARY_DIR}/clang.profdata 
${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_BINARY_DIR}/profiles/ COMMENT "Merging profdata" DEPENDS generate-profraw) + if (CLANG_PGO_TRAINING_DATA_SOURCE_DIR) + llvm_ExternalProject_Add(generate-profraw-external ${CLANG_PGO_TRAINING_DATA_SOURCE_DIR} + USE_TOOLCHAIN EXLUDE_FROM_ALL NO_INSTALL DEPENDS generate-profraw) + add_dependencies(generate-profdata generate-profraw-external) + endif() endif() endif() diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py index 99d6a3333b6ef0..3e92cd38a71451 100644 --- a/clang/utils/perf-training/perf-helper.py +++ b/clang/utils/perf-training/perf-helper.py @@ -30,26 +30,28 @@ def findFilesWithExtension(path, extension): def clean(args): - if len(args) != 2: + if len(args) < 2: print( - "Usage: %s clean \n" % __file__ + "Usage: %s clean \n" % __file__ + "\tRemoves all files with extension from ." ) return 1 - for filename in findFilesWithExtension(args[0], args[1]): - os.remove(filename) + for path in args[1:-1]: + for filename in findFilesWithExtension(path, args[-1]): + os.remove(filename) return 0 def merge(args): - if len(args) != 3: + if len(args) < 3: print( - "Usage: %s merge \n" % __file__ + "Usage: %s merge \n" % __file__ + "\tMerges all profraw files from path into output." ) return 1 cmd = [args[0], "merge", "-o", args[1]] - cmd.extend(findFilesWithExtension(args[2], "profraw")) + for path in args[2:]: + cmd.extend(findFilesWithExtension(path, "profraw")) subprocess.check_call(cmd) return 0 diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index f0d835fc091cea..c9d21f096a34c6 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -12630,7 +12630,7 @@

C++ defect report implementation status

2137 CD4 List-initialization from object of same type - Clang 18 + Unknown 2138 @@ -13674,7 +13674,7 @@

C++ defect report implementation status

2311 open Missed case for guaranteed copy elision - Clang 18 + Not resolved 2312 diff --git a/compiler-rt/lib/builtins/divtc3.c b/compiler-rt/lib/builtins/divtc3.c index e970cef574b21d..099de5802daf0e 100644 --- a/compiler-rt/lib/builtins/divtc3.c +++ b/compiler-rt/lib/builtins/divtc3.c @@ -13,7 +13,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_TF_MODE) +#if defined(CRT_HAS_F128) // Returns: the quotient of (a + ib) / (c + id) diff --git a/compiler-rt/lib/builtins/fp_lib.h b/compiler-rt/lib/builtins/fp_lib.h index af406e760497a4..c4f0a5b9587f77 100644 --- a/compiler-rt/lib/builtins/fp_lib.h +++ b/compiler-rt/lib/builtins/fp_lib.h @@ -22,6 +22,7 @@ #include "int_lib.h" #include "int_math.h" +#include "int_types.h" #include #include #include @@ -93,13 +94,14 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { COMPILER_RT_ABI fp_t __adddf3(fp_t a, fp_t b); #elif defined QUAD_PRECISION -#if defined(CRT_HAS_TF_MODE) +#if defined(CRT_HAS_F128) && defined(CRT_HAS_128BIT) typedef uint64_t half_rep_t; typedef __uint128_t rep_t; typedef __int128_t srep_t; typedef tf_float fp_t; #define HALF_REP_C UINT64_C #define REP_C (__uint128_t) +#if defined(CRT_HAS_IEEE_TF) // Note: Since there is no explicit way to tell compiler the constant is a // 128-bit integer, we let the constant be casted to 128-bit integer #define significandBits 112 @@ -188,7 +190,10 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { #undef Word_HiMask #undef Word_LoMask #undef Word_FullMask -#endif // defined(CRT_HAS_TF_MODE) +#endif // defined(CRT_HAS_IEEE_TF) +#else +typedef long double fp_t; +#endif // defined(CRT_HAS_F128) && defined(CRT_HAS_128BIT) #else #error SINGLE_PRECISION, DOUBLE_PRECISION or QUAD_PRECISION must be defined. 
#endif @@ -196,19 +201,6 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { #if defined(SINGLE_PRECISION) || defined(DOUBLE_PRECISION) || \ (defined(QUAD_PRECISION) && defined(CRT_HAS_TF_MODE)) #define typeWidth (sizeof(rep_t) * CHAR_BIT) -#define exponentBits (typeWidth - significandBits - 1) -#define maxExponent ((1 << exponentBits) - 1) -#define exponentBias (maxExponent >> 1) - -#define implicitBit (REP_C(1) << significandBits) -#define significandMask (implicitBit - 1U) -#define signBit (REP_C(1) << (significandBits + exponentBits)) -#define absMask (signBit - 1U) -#define exponentMask (absMask ^ significandMask) -#define oneRep ((rep_t)exponentBias << significandBits) -#define infRep exponentMask -#define quietBit (implicitBit >> 1) -#define qnanRep (exponentMask | quietBit) static __inline rep_t toRep(fp_t x) { const union { @@ -226,6 +218,21 @@ static __inline fp_t fromRep(rep_t x) { return rep.f; } +#if !defined(QUAD_PRECISION) || defined(CRT_HAS_IEEE_TF) +#define exponentBits (typeWidth - significandBits - 1) +#define maxExponent ((1 << exponentBits) - 1) +#define exponentBias (maxExponent >> 1) + +#define implicitBit (REP_C(1) << significandBits) +#define significandMask (implicitBit - 1U) +#define signBit (REP_C(1) << (significandBits + exponentBits)) +#define absMask (signBit - 1U) +#define exponentMask (absMask ^ significandMask) +#define oneRep ((rep_t)exponentBias << significandBits) +#define infRep exponentMask +#define quietBit (implicitBit >> 1) +#define qnanRep (exponentMask | quietBit) + static __inline int normalize(rep_t *significand) { const int shift = rep_clz(*significand) - rep_clz(implicitBit); *significand <<= shift; @@ -328,6 +335,8 @@ static __inline fp_t __compiler_rt_scalbnX(fp_t x, int y) { return fromRep(sign | ((rep_t)exp << significandBits) | sig); } +#endif // !defined(QUAD_PRECISION) || defined(CRT_HAS_IEEE_TF) + // Avoid using fmax from libm. 
static __inline fp_t __compiler_rt_fmaxX(fp_t x, fp_t y) { // If either argument is NaN, return the other argument. If both are NaN, @@ -405,6 +414,8 @@ static __inline tf_float __compiler_rt_fmaxtf(tf_float x, tf_float y) { #define __compiler_rt_logbl crt_logbl #define __compiler_rt_scalbnl crt_scalbnl #define __compiler_rt_fmaxl crt_fmaxl +#define crt_fabstf crt_fabsl +#define crt_copysigntf crt_copysignl #else #error Unsupported TF mode type #endif diff --git a/compiler-rt/lib/builtins/i386/chkstk.S b/compiler-rt/lib/builtins/i386/chkstk.S index a84bb0ee300705..cdd9a4c2a57522 100644 --- a/compiler-rt/lib/builtins/i386/chkstk.S +++ b/compiler-rt/lib/builtins/i386/chkstk.S @@ -14,7 +14,6 @@ .text .balign 4 DEFINE_COMPILERRT_FUNCTION(_alloca) // _chkstk and _alloca are the same function -DEFINE_COMPILERRT_FUNCTION(_chkstk) push %ecx cmp $0x1000,%eax lea 8(%esp),%ecx // esp before calling this routine -> ecx @@ -35,7 +34,6 @@ DEFINE_COMPILERRT_FUNCTION(_chkstk) push (%eax) // push return address onto the stack sub %esp,%eax // restore the original value in eax ret -END_COMPILERRT_FUNCTION(_chkstk) END_COMPILERRT_FUNCTION(_alloca) #endif // __i386__ diff --git a/compiler-rt/lib/builtins/int_types.h b/compiler-rt/lib/builtins/int_types.h index 7624c728061518..ca97391fc28466 100644 --- a/compiler-rt/lib/builtins/int_types.h +++ b/compiler-rt/lib/builtins/int_types.h @@ -189,12 +189,16 @@ typedef long double tf_float; #define CRT_LDBL_IEEE_F128 #endif #define TF_C(x) x##L -#elif __LDBL_MANT_DIG__ == 113 -// Use long double instead of __float128 if it matches the IEEE 128-bit format. +#elif __LDBL_MANT_DIG__ == 113 || \ + (__FLT_RADIX__ == 16 && __LDBL_MANT_DIG__ == 28) +// Use long double instead of __float128 if it matches the IEEE 128-bit format +// or the IBM hexadecimal format. 
#define CRT_LDBL_128BIT #define CRT_HAS_F128 +#if __LDBL_MANT_DIG__ == 113 #define CRT_HAS_IEEE_TF #define CRT_LDBL_IEEE_F128 +#endif typedef long double tf_float; #define TF_C(x) x##L #elif defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__) diff --git a/compiler-rt/lib/builtins/multc3.c b/compiler-rt/lib/builtins/multc3.c index f20e53ccbf233b..61a3f45e47279c 100644 --- a/compiler-rt/lib/builtins/multc3.c +++ b/compiler-rt/lib/builtins/multc3.c @@ -15,7 +15,7 @@ #include "int_lib.h" #include "int_math.h" -#if defined(CRT_HAS_TF_MODE) +#if defined(CRT_HAS_F128) // Returns: the product of a + ib and c + id diff --git a/compiler-rt/lib/builtins/riscv/restore.S b/compiler-rt/lib/builtins/riscv/restore.S index 73f64a920d6698..6f43842c8ca684 100644 --- a/compiler-rt/lib/builtins/riscv/restore.S +++ b/compiler-rt/lib/builtins/riscv/restore.S @@ -22,6 +22,8 @@ #if __riscv_xlen == 32 +#ifndef __riscv_32e + .globl __riscv_restore_12 .type __riscv_restore_12,@function __riscv_restore_12: @@ -86,8 +88,29 @@ __riscv_restore_0: addi sp, sp, 16 ret +#else + + .globl __riscv_restore_2 + .type __riscv_restore_2,@function + .globl __riscv_restore_1 + .type __riscv_restore_1,@function + .globl __riscv_restore_0 + .type __riscv_restore_0,@function +__riscv_restore_2: +__riscv_restore_1: +__riscv_restore_0: + lw s1, 0(sp) + lw s0, 4(sp) + lw ra, 8(sp) + addi sp, sp, 12 + ret + +#endif + #elif __riscv_xlen == 64 +#ifndef __riscv_64e + .globl __riscv_restore_12 .type __riscv_restore_12,@function __riscv_restore_12: @@ -161,6 +184,25 @@ __riscv_restore_0: addi sp, sp, 16 ret +#else + + .globl __riscv_restore_2 + .type __riscv_restore_2,@function + .globl __riscv_restore_1 + .type __riscv_restore_1,@function + .globl __riscv_restore_0 + .type __riscv_restore_0,@function +__riscv_restore_2: +__riscv_restore_1: +__riscv_restore_0: + ld s1, 0(sp) + ld s0, 8(sp) + ld ra, 16(sp) + addi sp, sp, 24 + ret + +#endif + #else # error "xlen must be 32 or 64 for save-restore implementation #endif 
diff --git a/compiler-rt/lib/builtins/riscv/save.S b/compiler-rt/lib/builtins/riscv/save.S index 85501aeb4c2e93..3e044179ff7f1d 100644 --- a/compiler-rt/lib/builtins/riscv/save.S +++ b/compiler-rt/lib/builtins/riscv/save.S @@ -18,6 +18,8 @@ #if __riscv_xlen == 32 +#ifndef __riscv_32e + .globl __riscv_save_12 .type __riscv_save_12,@function __riscv_save_12: @@ -92,8 +94,29 @@ __riscv_save_0: sw ra, 12(sp) jr t0 +#else + + .globl __riscv_save_2 + .type __riscv_save_2,@function + .globl __riscv_save_1 + .type __riscv_save_1,@function + .globl __riscv_save_0 + .type __riscv_save_0,@function +__riscv_save_2: +__riscv_save_1: +__riscv_save_0: + addi sp, sp, -12 + sw s1, 0(sp) + sw s0, 4(sp) + sw ra, 8(sp) + jr t0 + +#endif + #elif __riscv_xlen == 64 +#ifndef __riscv_64e + .globl __riscv_save_12 .type __riscv_save_12,@function __riscv_save_12: @@ -181,6 +204,25 @@ __riscv_save_0: sd ra, 8(sp) jr t0 +#else + + .globl __riscv_save_2 + .type __riscv_save_2,@function + .globl __riscv_save_1 + .type __riscv_save_1,@function + .globl __riscv_save_0 + .type __riscv_save_0,@function +__riscv_save_2: +__riscv_save_1: +__riscv_save_0: + addi sp, sp, -24 + sd s1, 0(sp) + sd s0, 8(sp) + sd ra, 16(sp) + jr t0 + +#endif + #else # error "xlen must be 32 or 64 for save-restore implementation #endif diff --git a/compiler-rt/lib/builtins/x86_64/chkstk.S b/compiler-rt/lib/builtins/x86_64/chkstk.S index 494ee261193bc7..ad7953a116ac7e 100644 --- a/compiler-rt/lib/builtins/x86_64/chkstk.S +++ b/compiler-rt/lib/builtins/x86_64/chkstk.S @@ -18,7 +18,6 @@ .text .balign 4 DEFINE_COMPILERRT_FUNCTION(___chkstk_ms) -DEFINE_COMPILERRT_FUNCTION(__chkstk) push %rcx push %rax cmp $0x1000,%rax @@ -36,7 +35,6 @@ DEFINE_COMPILERRT_FUNCTION(__chkstk) pop %rax pop %rcx ret -END_COMPILERRT_FUNCTION(__chkstk) END_COMPILERRT_FUNCTION(___chkstk_ms) #endif // __x86_64__ diff --git a/compiler-rt/lib/dfsan/dfsan_custom.cpp b/compiler-rt/lib/dfsan/dfsan_custom.cpp index 85b796bd6349c8..3af26e9f64c925 100644 --- 
a/compiler-rt/lib/dfsan/dfsan_custom.cpp +++ b/compiler-rt/lib/dfsan/dfsan_custom.cpp @@ -55,6 +55,10 @@ using namespace __dfsan; #define DECLARE_WEAK_INTERCEPTOR_HOOK(f, ...) \ SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void f(__VA_ARGS__); +#define WRAPPER_ALIAS(fun, real) \ + SANITIZER_INTERFACE_ATTRIBUTE void __dfsw_##fun() ALIAS(__dfsw_##real); \ + SANITIZER_INTERFACE_ATTRIBUTE void __dfso_##fun() ALIAS(__dfso_##real); + // Async-safe, non-reentrant spin lock. class SignalSpinLocker { public: @@ -1197,16 +1201,20 @@ char *__dfso_strcpy(char *dest, const char *src, dfsan_label dst_label, *ret_origin = dst_origin; return ret; } +} -static long int dfsan_strtol(const char *nptr, char **endptr, int base, - char **tmp_endptr) { +template +static ALWAYS_INLINE auto dfsan_strtol_impl( + Fn real, const char *nptr, char **endptr, int base, + char **tmp_endptr) -> decltype(real(nullptr, nullptr, 0)) { assert(tmp_endptr); - long int ret = strtol(nptr, tmp_endptr, base); + auto ret = real(nptr, tmp_endptr, base); if (endptr) *endptr = *tmp_endptr; return ret; } +extern "C" { static void dfsan_strtolong_label(const char *nptr, const char *tmp_endptr, dfsan_label base_label, dfsan_label *ret_label) { @@ -1236,30 +1244,6 @@ static void dfsan_strtolong_origin(const char *nptr, const char *tmp_endptr, } } -SANITIZER_INTERFACE_ATTRIBUTE -long int __dfsw_strtol(const char *nptr, char **endptr, int base, - dfsan_label nptr_label, dfsan_label endptr_label, - dfsan_label base_label, dfsan_label *ret_label) { - char *tmp_endptr; - long int ret = dfsan_strtol(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - return ret; -} - -SANITIZER_INTERFACE_ATTRIBUTE -long int __dfso_strtol(const char *nptr, char **endptr, int base, - dfsan_label nptr_label, dfsan_label endptr_label, - dfsan_label base_label, dfsan_label *ret_label, - dfsan_origin nptr_origin, dfsan_origin endptr_origin, - dfsan_origin base_origin, dfsan_origin 
*ret_origin) { - char *tmp_endptr; - long int ret = dfsan_strtol(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - dfsan_strtolong_origin(nptr, tmp_endptr, base_label, ret_label, base_origin, - ret_origin); - return ret; -} - static double dfsan_strtod(const char *nptr, char **endptr, char **tmp_endptr) { assert(tmp_endptr); double ret = strtod(nptr, tmp_endptr); @@ -1307,108 +1291,40 @@ double __dfso_strtod(const char *nptr, char **endptr, dfsan_label nptr_label, return ret; } -static long long int dfsan_strtoll(const char *nptr, char **endptr, int base, - char **tmp_endptr) { - assert(tmp_endptr); - long long int ret = strtoll(nptr, tmp_endptr, base); - if (endptr) - *endptr = *tmp_endptr; - return ret; -} - -SANITIZER_INTERFACE_ATTRIBUTE -long long int __dfsw_strtoll(const char *nptr, char **endptr, int base, - dfsan_label nptr_label, dfsan_label endptr_label, - dfsan_label base_label, dfsan_label *ret_label) { - char *tmp_endptr; - long long int ret = dfsan_strtoll(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - return ret; -} - -SANITIZER_INTERFACE_ATTRIBUTE -long long int __dfso_strtoll(const char *nptr, char **endptr, int base, - dfsan_label nptr_label, dfsan_label endptr_label, - dfsan_label base_label, dfsan_label *ret_label, - dfsan_origin nptr_origin, - dfsan_origin endptr_origin, - dfsan_origin base_origin, - dfsan_origin *ret_origin) { - char *tmp_endptr; - long long int ret = dfsan_strtoll(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - dfsan_strtolong_origin(nptr, tmp_endptr, base_label, ret_label, base_origin, - ret_origin); - return ret; -} - -static unsigned long int dfsan_strtoul(const char *nptr, char **endptr, - int base, char **tmp_endptr) { - assert(tmp_endptr); - unsigned long int ret = strtoul(nptr, tmp_endptr, base); - if (endptr) - *endptr = *tmp_endptr; - return ret; -} - 
-SANITIZER_INTERFACE_ATTRIBUTE -unsigned long int __dfsw_strtoul(const char *nptr, char **endptr, int base, - dfsan_label nptr_label, dfsan_label endptr_label, - dfsan_label base_label, dfsan_label *ret_label) { - char *tmp_endptr; - unsigned long int ret = dfsan_strtoul(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - return ret; -} - -SANITIZER_INTERFACE_ATTRIBUTE -unsigned long int __dfso_strtoul( - const char *nptr, char **endptr, int base, dfsan_label nptr_label, - dfsan_label endptr_label, dfsan_label base_label, dfsan_label *ret_label, - dfsan_origin nptr_origin, dfsan_origin endptr_origin, - dfsan_origin base_origin, dfsan_origin *ret_origin) { - char *tmp_endptr; - unsigned long int ret = dfsan_strtoul(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - dfsan_strtolong_origin(nptr, tmp_endptr, base_label, ret_label, base_origin, - ret_origin); - return ret; -} - -static long long unsigned int dfsan_strtoull(const char *nptr, char **endptr, - int base, char **tmp_endptr) { - assert(tmp_endptr); - long long unsigned int ret = strtoull(nptr, tmp_endptr, base); - if (endptr) - *endptr = *tmp_endptr; - return ret; -} - -SANITIZER_INTERFACE_ATTRIBUTE -long long unsigned int __dfsw_strtoull(const char *nptr, char **endptr, - int base, dfsan_label nptr_label, - dfsan_label endptr_label, - dfsan_label base_label, - dfsan_label *ret_label) { - char *tmp_endptr; - long long unsigned int ret = dfsan_strtoull(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - return ret; -} - -SANITIZER_INTERFACE_ATTRIBUTE -long long unsigned int __dfso_strtoull( - const char *nptr, char **endptr, int base, dfsan_label nptr_label, - dfsan_label endptr_label, dfsan_label base_label, dfsan_label *ret_label, - dfsan_origin nptr_origin, dfsan_origin endptr_origin, - dfsan_origin base_origin, dfsan_origin *ret_origin) { - char *tmp_endptr; 
- long long unsigned int ret = dfsan_strtoull(nptr, endptr, base, &tmp_endptr); - dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); - dfsan_strtolong_origin(nptr, tmp_endptr, base_label, ret_label, base_origin, - ret_origin); - return ret; -} +WRAPPER_ALIAS(__isoc23_strtod, strtod) + +#define WRAPPER_STRTO(ret_type, fun) \ + SANITIZER_INTERFACE_ATTRIBUTE ret_type __dfsw_##fun( \ + const char *nptr, char **endptr, int base, dfsan_label nptr_label, \ + dfsan_label endptr_label, dfsan_label base_label, \ + dfsan_label *ret_label) { \ + char *tmp_endptr; \ + auto ret = dfsan_strtol_impl(fun, nptr, endptr, base, &tmp_endptr); \ + dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); \ + return ret; \ + } \ + SANITIZER_INTERFACE_ATTRIBUTE ret_type __dfso_##fun( \ + const char *nptr, char **endptr, int base, dfsan_label nptr_label, \ + dfsan_label endptr_label, dfsan_label base_label, \ + dfsan_label *ret_label, dfsan_origin nptr_origin, \ + dfsan_origin endptr_origin, dfsan_origin base_origin, \ + dfsan_origin *ret_origin) { \ + char *tmp_endptr; \ + auto ret = dfsan_strtol_impl(fun, nptr, endptr, base, &tmp_endptr); \ + dfsan_strtolong_label(nptr, tmp_endptr, base_label, ret_label); \ + dfsan_strtolong_origin(nptr, tmp_endptr, base_label, ret_label, \ + base_origin, ret_origin); \ + return ret; \ + } + +WRAPPER_STRTO(long, strtol) +WRAPPER_STRTO(long long, strtoll) +WRAPPER_STRTO(unsigned long, strtoul) +WRAPPER_STRTO(unsigned long long, strtoull) +WRAPPER_ALIAS(__isoc23_strtol, strtol) +WRAPPER_ALIAS(__isoc23_strtoll, strtoll) +WRAPPER_ALIAS(__isoc23_strtoul, strtoul) +WRAPPER_ALIAS(__isoc23_strtoull, strtoull) SANITIZER_INTERFACE_ATTRIBUTE time_t __dfsw_time(time_t *t, dfsan_label t_label, dfsan_label *ret_label) { @@ -2231,7 +2147,7 @@ SANITIZER_INTERFACE_ATTRIBUTE int __dfso_write( *ret_label = 0; return write(fd, buf, count); } -} // namespace __dfsan +} // namespace __dfsan // Type used to extract a dfsan_label with va_arg() typedef int 
dfsan_label_va; @@ -2866,31 +2782,8 @@ int __dfso_sscanf(char *str, const char *format, dfsan_label str_label, return ret; } -SANITIZER_INTERFACE_ATTRIBUTE -int __dfsw___isoc99_sscanf(char *str, const char *format, dfsan_label str_label, - dfsan_label format_label, dfsan_label *va_labels, - dfsan_label *ret_label, ...) { - va_list ap; - va_start(ap, ret_label); - int ret = scan_buffer(str, ~0ul, format, va_labels, ret_label, nullptr, - nullptr, ap); - va_end(ap); - return ret; -} - -SANITIZER_INTERFACE_ATTRIBUTE -int __dfso___isoc99_sscanf(char *str, const char *format, dfsan_label str_label, - dfsan_label format_label, dfsan_label *va_labels, - dfsan_label *ret_label, dfsan_origin str_origin, - dfsan_origin format_origin, dfsan_origin *va_origins, - dfsan_origin *ret_origin, ...) { - va_list ap; - va_start(ap, ret_origin); - int ret = scan_buffer(str, ~0ul, format, va_labels, ret_label, &str_origin, - ret_origin, ap); - va_end(ap); - return ret; -} +WRAPPER_ALIAS(__isoc99_sscanf, sscanf) +WRAPPER_ALIAS(__isoc23_sscanf, sscanf) static void BeforeFork() { StackDepotLockBeforeFork(); diff --git a/compiler-rt/lib/dfsan/done_abilist.txt b/compiler-rt/lib/dfsan/done_abilist.txt index c582584d77e45f..86a42ee1b4dce8 100644 --- a/compiler-rt/lib/dfsan/done_abilist.txt +++ b/compiler-rt/lib/dfsan/done_abilist.txt @@ -270,6 +270,11 @@ fun:strtoul=custom fun:strtoull=custom fun:strcat=custom fun:strncat=custom +fun:__isoc23_strtod=custom +fun:__isoc23_strtol=custom +fun:__isoc23_strtoll=custom +fun:__isoc23_strtoul=custom +fun:__isoc23_strtoull=custom # Functions that produce an output that is computed from the input, but is not # necessarily data dependent. 
@@ -311,6 +316,7 @@ fun:snprintf=custom # scanf-like fun:sscanf=custom fun:__isoc99_sscanf=custom +fun:__isoc23_sscanf=custom # TODO: custom fun:asprintf=discard diff --git a/compiler-rt/lib/dfsan/libc_ubuntu1404_abilist.txt b/compiler-rt/lib/dfsan/libc_ubuntu1404_abilist.txt index 433092e2b27b8c..9ffa56a238185f 100644 --- a/compiler-rt/lib/dfsan/libc_ubuntu1404_abilist.txt +++ b/compiler-rt/lib/dfsan/libc_ubuntu1404_abilist.txt @@ -1,3 +1,8 @@ +fun:__isoc23_sscanf=uninstrumented +fun:__isoc23_strtol=uninstrumented +fun:__isoc23_strtoll=uninstrumented +fun:__isoc23_strtoul=uninstrumented +fun:__isoc23_strtoull=uninstrumented fun:_Exit=uninstrumented fun:_IO_adjust_column=uninstrumented fun:_IO_adjust_wcolumn=uninstrumented diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp index 3cdf10c149902c..a2fc27de1901b4 100644 --- a/compiler-rt/lib/msan/msan.cpp +++ b/compiler-rt/lib/msan/msan.cpp @@ -467,7 +467,7 @@ void __msan_init() { __msan_clear_on_return(); if (__msan_get_track_origins()) VPrintf(1, "msan_track_origins\n"); - if (!InitShadow(__msan_get_track_origins())) { + if (!InitShadowWithReExec(__msan_get_track_origins())) { Printf("FATAL: MemorySanitizer can not mmap the shadow memory.\n"); Printf("FATAL: Make sure to compile with -fPIE and to link with -pie.\n"); Printf("FATAL: Disabling ASLR is known to cause this error.\n"); diff --git a/compiler-rt/lib/msan/msan.h b/compiler-rt/lib/msan/msan.h index 710447a3e1a357..7fb58be67a02cd 100644 --- a/compiler-rt/lib/msan/msan.h +++ b/compiler-rt/lib/msan/msan.h @@ -33,12 +33,18 @@ struct MappingDesc { uptr start; uptr end; enum Type { - INVALID, APP, SHADOW, ORIGIN + INVALID = 1, + ALLOCATOR = 2, + APP = 4, + SHADOW = 8, + ORIGIN = 16, } type; const char *name; }; - +// Note: MappingDesc::ALLOCATOR entries are only used to check for memory +// layout compatibility. The actual allocation settings are in +// msan_allocator.cpp, which need to be kept in sync. 
#if SANITIZER_LINUX && defined(__mips64) // MIPS64 maps: @@ -84,7 +90,8 @@ const MappingDesc kMemoryLayout[] = { {0X0B00000000000, 0X0C00000000000, MappingDesc::SHADOW, "shadow-10-13"}, {0X0C00000000000, 0X0D00000000000, MappingDesc::INVALID, "invalid"}, {0X0D00000000000, 0X0E00000000000, MappingDesc::ORIGIN, "origin-10-13"}, - {0X0E00000000000, 0X1000000000000, MappingDesc::APP, "app-15"}, + {0x0E00000000000, 0x0E40000000000, MappingDesc::ALLOCATOR, "allocator"}, + {0X0E40000000000, 0X1000000000000, MappingDesc::APP, "app-15"}, }; # define MEM_TO_SHADOW(mem) ((uptr)mem ^ 0xB00000000000ULL) # define SHADOW_TO_ORIGIN(shadow) (((uptr)(shadow)) + 0x200000000000ULL) @@ -106,7 +113,8 @@ const MappingDesc kMemoryLayout[] = { {0x510000000000ULL, 0x600000000000ULL, MappingDesc::APP, "app-2"}, {0x600000000000ULL, 0x610000000000ULL, MappingDesc::ORIGIN, "origin-1"}, {0x610000000000ULL, 0x700000000000ULL, MappingDesc::INVALID, "invalid"}, - {0x700000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}}; + {0x700000000000ULL, 0x740000000000ULL, MappingDesc::ALLOCATOR, "allocator"}, + {0x740000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}}; # define MEM_TO_SHADOW(mem) (((uptr)(mem)) ^ 0x500000000000ULL) # define SHADOW_TO_ORIGIN(shadow) (((uptr)(shadow)) + 0x100000000000ULL) @@ -118,7 +126,8 @@ const MappingDesc kMemoryLayout[] = { {0x180200000000ULL, 0x1C0000000000ULL, MappingDesc::INVALID, "invalid"}, {0x1C0000000000ULL, 0x2C0200000000ULL, MappingDesc::ORIGIN, "origin"}, {0x2C0200000000ULL, 0x300000000000ULL, MappingDesc::INVALID, "invalid"}, - {0x300000000000ULL, 0x800000000000ULL, MappingDesc::APP, "high memory"}}; + {0x300000000000ULL, 0x320000000000ULL, MappingDesc::ALLOCATOR, "allocator"}, + {0x320000000000ULL, 0x800000000000ULL, MappingDesc::APP, "high memory"}}; // Various kernels use different low end ranges but we can combine them into one // big range. 
They also use different high end ranges but we can map them all to @@ -141,7 +150,8 @@ const MappingDesc kMemoryLayout[] = { {0x180000000000ULL, 0x1C0000000000ULL, MappingDesc::INVALID, "invalid"}, {0x1C0000000000ULL, 0x2C0000000000ULL, MappingDesc::ORIGIN, "origin"}, {0x2C0000000000ULL, 0x440000000000ULL, MappingDesc::INVALID, "invalid"}, - {0x440000000000ULL, 0x500000000000ULL, MappingDesc::APP, "high memory"}}; + {0x440000000000ULL, 0x460000000000ULL, MappingDesc::ALLOCATOR, "allocator"}, + {0x460000000000ULL, 0x500000000000ULL, MappingDesc::APP, "high memory"}}; #define MEM_TO_SHADOW(mem) \ ((((uptr)(mem)) & ~0xC00000000000ULL) + 0x080000000000ULL) @@ -208,7 +218,8 @@ const MappingDesc kMemoryLayout[] = { {0x510000000000ULL, 0x600000000000ULL, MappingDesc::APP, "app-2"}, {0x600000000000ULL, 0x610000000000ULL, MappingDesc::ORIGIN, "origin-1"}, {0x610000000000ULL, 0x700000000000ULL, MappingDesc::INVALID, "invalid"}, - {0x700000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}}; + {0x700000000000ULL, 0x740000000000ULL, MappingDesc::ALLOCATOR, "allocator"}, + {0x740000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}}; #define MEM_TO_SHADOW(mem) (((uptr)(mem)) ^ 0x500000000000ULL) #define SHADOW_TO_ORIGIN(mem) (((uptr)(mem)) + 0x100000000000ULL) @@ -223,20 +234,22 @@ const uptr kMemoryLayoutSize = sizeof(kMemoryLayout) / sizeof(kMemoryLayout[0]); #ifndef __clang__ __attribute__((optimize("unroll-loops"))) #endif -inline bool addr_is_type(uptr addr, MappingDesc::Type mapping_type) { +inline bool +addr_is_type(uptr addr, int mapping_types) { // It is critical for performance that this loop is unrolled (because then it is // simplified into just a few constant comparisons). 
#ifdef __clang__ #pragma unroll #endif for (unsigned i = 0; i < kMemoryLayoutSize; ++i) - if (kMemoryLayout[i].type == mapping_type && + if ((kMemoryLayout[i].type & mapping_types) && addr >= kMemoryLayout[i].start && addr < kMemoryLayout[i].end) return true; return false; } -#define MEM_IS_APP(mem) addr_is_type((uptr)(mem), MappingDesc::APP) +#define MEM_IS_APP(mem) \ + (addr_is_type((uptr)(mem), MappingDesc::APP | MappingDesc::ALLOCATOR)) #define MEM_IS_SHADOW(mem) addr_is_type((uptr)(mem), MappingDesc::SHADOW) #define MEM_IS_ORIGIN(mem) addr_is_type((uptr)(mem), MappingDesc::ORIGIN) @@ -250,7 +263,7 @@ extern bool msan_init_is_running; extern int msan_report_count; bool ProtectRange(uptr beg, uptr end); -bool InitShadow(bool init_origins); +bool InitShadowWithReExec(bool init_origins); char *GetProcSelfMaps(); void InitializeInterceptors(); diff --git a/compiler-rt/lib/msan/msan_allocator.cpp b/compiler-rt/lib/msan/msan_allocator.cpp index 0b2dd2b2f1883d..b1bc5b9390f75b 100644 --- a/compiler-rt/lib/msan/msan_allocator.cpp +++ b/compiler-rt/lib/msan/msan_allocator.cpp @@ -48,6 +48,9 @@ struct MsanMapUnmapCallback { } }; +// Note: to ensure that the allocator is compatible with the application memory +// layout (especially with high-entropy ASLR), kSpaceBeg and kSpaceSize must be +// duplicated as MappingDesc::ALLOCATOR in msan.h. 
#if defined(__mips64) static const uptr kMaxAllowedMallocSize = 2UL << 30; diff --git a/compiler-rt/lib/msan/msan_linux.cpp b/compiler-rt/lib/msan/msan_linux.cpp index c7ecb7cad56661..cd2d9f5c720c57 100644 --- a/compiler-rt/lib/msan/msan_linux.cpp +++ b/compiler-rt/lib/msan/msan_linux.cpp @@ -20,6 +20,9 @@ # include # include # include +# if SANITIZER_LINUX +# include +# endif # include # include # include @@ -43,11 +46,13 @@ void ReportMapRange(const char *descr, uptr beg, uptr size) { } } -static bool CheckMemoryRangeAvailability(uptr beg, uptr size) { +static bool CheckMemoryRangeAvailability(uptr beg, uptr size, bool verbose) { if (size > 0) { uptr end = beg + size - 1; if (!MemoryRangeIsAvailable(beg, end)) { - Printf("FATAL: Memory range 0x%zx - 0x%zx is not available.\n", beg, end); + if (verbose) + Printf("FATAL: Memory range 0x%zx - 0x%zx is not available.\n", beg, + end); return false; } } @@ -86,7 +91,7 @@ static void CheckMemoryLayoutSanity() { CHECK(addr_is_type(start, type)); CHECK(addr_is_type((start + end) / 2, type)); CHECK(addr_is_type(end - 1, type)); - if (type == MappingDesc::APP) { + if (type == MappingDesc::APP || type == MappingDesc::ALLOCATOR) { uptr addr = start; CHECK(MEM_IS_SHADOW(MEM_TO_SHADOW(addr))); CHECK(MEM_IS_ORIGIN(MEM_TO_ORIGIN(addr))); @@ -106,7 +111,7 @@ static void CheckMemoryLayoutSanity() { } } -bool InitShadow(bool init_origins) { +static bool InitShadow(bool init_origins, bool dry_run) { // Let user know mapping parameters first. VPrintf(1, "__msan_init %p\n", reinterpret_cast(&__msan_init)); for (unsigned i = 0; i < kMemoryLayoutSize; ++i) @@ -116,8 +121,9 @@ bool InitShadow(bool init_origins) { CheckMemoryLayoutSanity(); if (!MEM_IS_APP(&__msan_init)) { - Printf("FATAL: Code %p is out of application range. Non-PIE build?\n", - reinterpret_cast(&__msan_init)); + if (!dry_run) + Printf("FATAL: Code %p is out of application range. 
Non-PIE build?\n", + reinterpret_cast(&__msan_init)); return false; } @@ -138,20 +144,26 @@ bool InitShadow(bool init_origins) { bool protect = type == MappingDesc::INVALID || (!init_origins && type == MappingDesc::ORIGIN); CHECK(!(map && protect)); - if (!map && !protect) - CHECK(type == MappingDesc::APP); + if (!map && !protect) { + CHECK(type == MappingDesc::APP || type == MappingDesc::ALLOCATOR); + + if (dry_run && type == MappingDesc::ALLOCATOR && + !CheckMemoryRangeAvailability(start, size, !dry_run)) + return false; + } if (map) { - if (!CheckMemoryRangeAvailability(start, size)) + if (dry_run && !CheckMemoryRangeAvailability(start, size, !dry_run)) return false; - if (!MmapFixedSuperNoReserve(start, size, kMemoryLayout[i].name)) + if (!dry_run && + !MmapFixedSuperNoReserve(start, size, kMemoryLayout[i].name)) return false; - if (common_flags()->use_madv_dontdump) + if (!dry_run && common_flags()->use_madv_dontdump) DontDumpShadowMemory(start, size); } if (protect) { - if (!CheckMemoryRangeAvailability(start, size)) + if (dry_run && !CheckMemoryRangeAvailability(start, size, !dry_run)) return false; - if (!ProtectMemoryRange(start, size, kMemoryLayout[i].name)) + if (!dry_run && !ProtectMemoryRange(start, size, kMemoryLayout[i].name)) return false; } } @@ -159,6 +171,35 @@ bool InitShadow(bool init_origins) { return true; } +bool InitShadowWithReExec(bool init_origins) { + // Start with dry run: check layout is ok, but don't print warnings because + // warning messages will cause tests to fail (even if we successfully re-exec + // after the warning). + bool success = InitShadow(__msan_get_track_origins(), true); + if (!success) { +# if SANITIZER_LINUX + // Perhaps ASLR entropy is too high. If ASLR is enabled, re-exec without it. 
+ int old_personality = personality(0xffffffff); + bool aslr_on = + (old_personality != -1) && ((old_personality & ADDR_NO_RANDOMIZE) == 0); + + if (aslr_on) { + VReport(1, + "WARNING: MemorySanitizer: memory layout is incompatible, " + "possibly due to high-entropy ASLR.\n" + "Re-execing with fixed virtual address space.\n" + "N.B. reducing ASLR entropy is preferable.\n"); + CHECK_NE(personality(old_personality | ADDR_NO_RANDOMIZE), -1); + ReExec(); + } +# endif + } + + // The earlier dry run didn't actually map or protect anything. Run again in + // non-dry run mode. + return success && InitShadow(__msan_get_track_origins(), false); +} + static void MsanAtExit(void) { if (flags()->print_stats && (flags()->atexit || msan_report_count > 0)) ReportStats(); diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 867ae73f0d3b27..f3b457d786e6bd 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -677,6 +677,7 @@ static void initializeProfileForContinuousMode(void) { PROF_ERR("Continuous counter sync mode is enabled, but raw profile is not" "page-aligned. 
CurrentFileOffset = %" PRIu64 ", pagesz = %u.\n", (uint64_t)CurrentFileOffset, PageSize); + fclose(File); return; } if (writeProfileWithFileObject(Filename, File) != 0) { @@ -692,6 +693,8 @@ static void initializeProfileForContinuousMode(void) { if (doMerging()) { lprofUnlockFileHandle(File); + } + if (File != NULL) { fclose(File); } } diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c b/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c index 9f46a98d78ac4e..002bec164d7e85 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c @@ -195,6 +195,8 @@ static const int dummy_name[0] COMPILER_RT_SECTION( COMPILER_RT_SEG INSTR_PROF_NAME_SECT_NAME); static int dummy_vnds[0] COMPILER_RT_SECTION( COMPILER_RT_SEG INSTR_PROF_VNODES_SECT_NAME); +static int dummy_orderfile[0] COMPILER_RT_SECTION( + COMPILER_RT_SEG INSTR_PROF_ORDERFILE_SECT_NAME); // To avoid GC'ing of the dummy variables by the linker, reference them in an // array and reference the array in the runtime registration code @@ -206,7 +208,7 @@ static int dummy_vnds[0] COMPILER_RT_SECTION( COMPILER_RT_VISIBILITY void *__llvm_profile_keep[] = {(void *)&dummy_cnts, (void *)&dummy_bits, (void *)&dummy_data, (void *)&dummy_name, - (void *)&dummy_vnds}; + (void *)&dummy_vnds, (void *)&dummy_orderfile}; #ifdef __GNUC__ #pragma GCC diagnostic pop #endif diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c b/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c index c976776ae59e9c..0751b28f81d0ac 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c @@ -77,7 +77,7 @@ ValueProfNode *EndVNode = &VNodesEnd; /* lld-link provides __buildid symbol which ponits to the 16 bytes build id when * using /build-id flag. 
https://lld.llvm.org/windows_support.html#lld-flags */ #define BUILD_ID_LEN 16 -COMPILER_RT_WEAK extern uint8_t __buildid[BUILD_ID_LEN]; +COMPILER_RT_WEAK uint8_t __buildid[BUILD_ID_LEN]; COMPILER_RT_VISIBILITY int __llvm_write_binary_ids(ProfDataWriter *Writer) { if (*__buildid) { if (Writer && diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index 1b56bebac64e68..3ecdb55cdbf72f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -10218,20 +10218,6 @@ INTERCEPTOR(int, __xuname, int size, void *utsname) { #define INIT___XUNAME #endif -#if SANITIZER_INTERCEPT_HEXDUMP -INTERCEPTOR(void, hexdump, const void *ptr, int length, const char *header, int flags) { - void *ctx; - COMMON_INTERCEPTOR_ENTER(ctx, hexdump, ptr, length, header, flags); - COMMON_INTERCEPTOR_READ_RANGE(ctx, ptr, length); - COMMON_INTERCEPTOR_READ_RANGE(ctx, header, internal_strlen(header) + 1); - REAL(hexdump)(ptr, length, header, flags); -} - -#define INIT_HEXDUMP COMMON_INTERCEPT_FUNCTION(hexdump); -#else -#define INIT_HEXDUMP -#endif - #if SANITIZER_INTERCEPT_ARGP_PARSE INTERCEPTOR(int, argp_parse, const struct argp *argp, int argc, char **argv, unsigned flags, int *arg_index, void *input) { @@ -10581,7 +10567,6 @@ static void InitializeCommonInterceptors() { INIT_PROCCTL INIT_UNAME; INIT___XUNAME; - INIT_HEXDUMP; INIT_ARGP_PARSE; INIT_CPUSET_GETAFFINITY; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h index 0ce4e9351bc1da..de55c736d0e144 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h @@ -596,7 +596,6 @@ #define SANITIZER_INTERCEPT___XUNAME SI_FREEBSD #define SANITIZER_INTERCEPT_FLOPEN 
SI_FREEBSD #define SANITIZER_INTERCEPT_PROCCTL SI_FREEBSD -#define SANITIZER_INTERCEPT_HEXDUMP SI_FREEBSD #define SANITIZER_INTERCEPT_ARGP_PARSE SI_GLIBC #define SANITIZER_INTERCEPT_CPUSET_GETAFFINITY SI_FREEBSD diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp index 8438e019591b58..f6b157c07c6557 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp @@ -34,8 +34,10 @@ static bool FrameIsInternal(const SymbolizedStack *frame) { return true; const char *file = frame->info.file; const char *module = frame->info.module; + // On Gentoo, the path is g++-*, so there's *not* a missing /. if (file && (internal_strstr(file, "/compiler-rt/lib/") || - internal_strstr(file, "/include/c++/"))) + internal_strstr(file, "/include/c++/") || + internal_strstr(file, "/include/g++"))) return true; if (module && (internal_strstr(module, "libclang_rt."))) return true; diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp index a9f6673ac44e90..d0282c27043125 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp @@ -14,6 +14,7 @@ #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_errno.h" +#include "sanitizer_common/sanitizer_glibc_version.h" #include "sanitizer_common/sanitizer_libc.h" #include "sanitizer_common/sanitizer_linux.h" #include "sanitizer_common/sanitizer_platform_limits_netbsd.h" @@ -1613,47 +1614,40 @@ TSAN_INTERCEPTOR(int, __fxstat, int version, int fd, void *buf) { FdAccess(thr, pc, fd); return REAL(__fxstat)(version, fd, buf); } -#define TSAN_MAYBE_INTERCEPT___FXSTAT TSAN_INTERCEPT(__fxstat) + +TSAN_INTERCEPTOR(int, __fxstat64, int version, int fd, void *buf) { + SCOPED_TSAN_INTERCEPTOR(__fxstat64, version, fd, buf); 
+ if (fd > 0) + FdAccess(thr, pc, fd); + return REAL(__fxstat64)(version, fd, buf); +} +#define TSAN_MAYBE_INTERCEPT___FXSTAT TSAN_INTERCEPT(__fxstat); TSAN_INTERCEPT(__fxstat64) #else #define TSAN_MAYBE_INTERCEPT___FXSTAT #endif +#if !SANITIZER_GLIBC || __GLIBC_PREREQ(2, 33) TSAN_INTERCEPTOR(int, fstat, int fd, void *buf) { -#if SANITIZER_GLIBC - SCOPED_TSAN_INTERCEPTOR(__fxstat, 0, fd, buf); - if (fd > 0) - FdAccess(thr, pc, fd); - return REAL(__fxstat)(0, fd, buf); -#else SCOPED_TSAN_INTERCEPTOR(fstat, fd, buf); if (fd > 0) FdAccess(thr, pc, fd); return REAL(fstat)(fd, buf); -#endif -} - -#if SANITIZER_GLIBC -TSAN_INTERCEPTOR(int, __fxstat64, int version, int fd, void *buf) { - SCOPED_TSAN_INTERCEPTOR(__fxstat64, version, fd, buf); - if (fd > 0) - FdAccess(thr, pc, fd); - return REAL(__fxstat64)(version, fd, buf); } -#define TSAN_MAYBE_INTERCEPT___FXSTAT64 TSAN_INTERCEPT(__fxstat64) +# define TSAN_MAYBE_INTERCEPT_FSTAT TSAN_INTERCEPT(fstat) #else -#define TSAN_MAYBE_INTERCEPT___FXSTAT64 +# define TSAN_MAYBE_INTERCEPT_FSTAT #endif -#if SANITIZER_GLIBC +#if __GLIBC_PREREQ(2, 33) TSAN_INTERCEPTOR(int, fstat64, int fd, void *buf) { - SCOPED_TSAN_INTERCEPTOR(__fxstat64, 0, fd, buf); + SCOPED_TSAN_INTERCEPTOR(fstat64, fd, buf); if (fd > 0) FdAccess(thr, pc, fd); - return REAL(__fxstat64)(0, fd, buf); + return REAL(fstat64)(fd, buf); } -#define TSAN_MAYBE_INTERCEPT_FSTAT64 TSAN_INTERCEPT(fstat64) +# define TSAN_MAYBE_INTERCEPT_FSTAT64 TSAN_INTERCEPT(fstat64) #else -#define TSAN_MAYBE_INTERCEPT_FSTAT64 +# define TSAN_MAYBE_INTERCEPT_FSTAT64 #endif TSAN_INTERCEPTOR(int, open, const char *name, int oflag, ...) 
{ @@ -2950,10 +2944,9 @@ void InitializeInterceptors() { TSAN_INTERCEPT(pthread_once); - TSAN_INTERCEPT(fstat); TSAN_MAYBE_INTERCEPT___FXSTAT; + TSAN_MAYBE_INTERCEPT_FSTAT; TSAN_MAYBE_INTERCEPT_FSTAT64; - TSAN_MAYBE_INTERCEPT___FXSTAT64; TSAN_INTERCEPT(open); TSAN_MAYBE_INTERCEPT_OPEN64; TSAN_INTERCEPT(creat); diff --git a/compiler-rt/test/profile/AIX/bexpfull-pgo.c b/compiler-rt/test/profile/AIX/bexpfull-pgo.c new file mode 100644 index 00000000000000..f48242ec6bfeaa --- /dev/null +++ b/compiler-rt/test/profile/AIX/bexpfull-pgo.c @@ -0,0 +1,7 @@ +// RUN: %clang_pgogen %s -bexpall +// RUN: %clang_pgogen %s -bexpfull + +#include +int ar[10]; +int n; +int main() { memcpy(ar, ar + 1, n); }; diff --git a/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/hexdump.cc b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/hexdump.cc deleted file mode 100644 index e07650d64102a4..00000000000000 --- a/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/hexdump.cc +++ /dev/null @@ -1,23 +0,0 @@ -// RUN: %clangxx -O0 -g %s -o %t -lutil && %run %t 2>&1 | FileCheck %s - -#include -#include -#include -#include - -int main(void) { - printf("hexdump"); - char *line; - size_t lineno = 0, len; - const char *delim = "\\\\#"; - FILE *fp = fopen("/etc/fstab", "r"); - assert(fp); - line = fparseln(fp, &len, &lineno, delim, 0); - hexdump(line, len, nullptr, 0); - free(line); - fclose(fp); - assert(lineno != 0); - assert(len > 0); - - return 0; -} diff --git a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter_default_impl.cpp b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter_default_impl.cpp index 1ac04b53491e14..1d1fbf7299e8b4 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter_default_impl.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter_default_impl.cpp @@ -3,7 +3,9 @@ // REQUIRES: has_sancovcc,stable-runtime,linux,x86_64-target-arch 
-// RUN: %clangxx -O0 %s -fsanitize-coverage=inline-8bit-counters,pc-table -o %t +/// In glibc 2.39+, fprintf has a nonnull attribute. Disable nonnull-attribute, +/// which would increase counters for ubsan. +// RUN: %clangxx -O0 %s -fsanitize-coverage=inline-8bit-counters,pc-table -fno-sanitize=nonnull-attribute -o %t // RUN: rm -f %t-counters %t-pcs // RUN: env %tool_options="cov_8bit_counters_out=%t-counters cov_pcs_out=%t-pcs verbosity=1" %run %t 2>&1 | FileCheck %s diff --git a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cpp b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cpp index daa994c8116251..b168954a1c92cf 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cpp @@ -7,7 +7,9 @@ // RUN: rm -rf $DIR // RUN: mkdir -p $DIR // RUN: cd $DIR -// RUN: %clangxx -O0 -fsanitize-coverage=trace-pc-guard %s -o %t +/// In glibc 2.39+, fprintf has a nonnull attribute. Disable nonnull-attribute, +/// which would increase counters for ubsan. 
+// RUN: %clangxx -O0 -fsanitize-coverage=trace-pc-guard -fno-sanitize=nonnull-attribute %s -o %t // RUN: %env_tool_opts=coverage=1 %t 2>&1 | FileCheck %s // RUN: rm -rf $DIR diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl new file mode 100644 index 00000000000000..867273126f8fdf --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl @@ -0,0 +1,107 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); + +// Wave32 + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w32: +// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c); +} + + 
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w32: +// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 
v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false); +} diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl new file mode 100644 index 00000000000000..ad01450c35a76f --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl @@ -0,0 +1,104 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// Wave64 + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w64: +// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) +{ + 
*out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w64: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w64: +// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w64: +// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w64: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w64: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32: +// CHECK-GFX1200: 
v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false); +} diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl new file mode 100644 index 00000000000000..317d9a1102ccff --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl @@ -0,0 +1,110 @@ +// RUN: 
%clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef short v16s __attribute__((ext_vector_type(16))); + +// Wave32 + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w32: +// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w32: +// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s 
a, v16s b, v8s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, 
v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index); +} diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl new file mode 100644 index 00000000000000..eb81234f53a663 --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl @@ -0,0 +1,109 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); + +// Wave64 + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f 
c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w64: +// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w64: +// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w64: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w64: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, 
index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w64: +// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index); +} diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index 
5ade2574032977..5ad6d01e8a8ed6 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -852,13 +852,13 @@ used in constant expressions have currently no folding support at all. - **Syntax:** `CALL EXECUTE_COMMAND_LINE(COMMAND [, WAIT, EXITSTAT, CMDSTAT, CMDMSG ])` - **Arguments:** - | Argument | Description | - |-----------|--------------------------------------------------------------| - | `COMMAND` | Shall be a default CHARACTER scalar. | - | `WAIT` | (Optional) Shall be a default LOGICAL scalar. | - | `EXITSTAT`| (Optional) Shall be an INTEGER of the default kind. | - | `CMDSTAT` | (Optional) Shall be an INTEGER of the default kind. | - | `CMDMSG` | (Optional) Shall be a CHARACTER scalar of the default kind. | +| Argument | Description | +|------------|-----------------------------------------------------------------------| +| `COMMAND` | Shall be a default CHARACTER scalar. | +| `WAIT` | (Optional) Shall be a default LOGICAL scalar. | +| `EXITSTAT` | (Optional) Shall be an INTEGER with kind greater than or equal to 4. | +| `CMDSTAT` | (Optional) Shall be an INTEGER with kind greater than or equal to 2. | +| `CMDMSG` | (Optional) Shall be a CHARACTER scalar of the default kind. | #### Implementation Specifics diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index da6d5970089884..1701a475942ff5 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -78,6 +78,8 @@ static constexpr CategorySet AnyType{IntrinsicType | DerivedType}; ENUM_CLASS(KindCode, none, defaultIntegerKind, defaultRealKind, // is also the default COMPLEX kind doublePrecision, defaultCharKind, defaultLogicalKind, + greaterOrEqualToKind, // match kind value greater than or equal to a single + // explicit kind value any, // matches any kind value; each instance is independent // match any kind, but all "same" kinds must be equal. For characters, also // implies that lengths must be equal. 
@@ -104,7 +106,7 @@ ENUM_CLASS(KindCode, none, defaultIntegerKind, struct TypePattern { CategorySet categorySet; KindCode kindCode{KindCode::none}; - int exactKindValue{0}; // for KindCode::exactKind + int kindValue{0}; // for KindCode::exactKind and greaterOrEqualToKind llvm::raw_ostream &Dump(llvm::raw_ostream &) const; }; @@ -1314,10 +1316,11 @@ static const IntrinsicInterface intrinsicSubroutine[]{ {"execute_command_line", {{"command", DefaultChar, Rank::scalar}, {"wait", AnyLogical, Rank::scalar, Optionality::optional}, - {"exitstat", AnyInt, Rank::scalar, Optionality::optional, - common::Intent::InOut}, - {"cmdstat", AnyInt, Rank::scalar, Optionality::optional, - common::Intent::Out}, + {"exitstat", + TypePattern{IntType, KindCode::greaterOrEqualToKind, 4}, + Rank::scalar, Optionality::optional, common::Intent::InOut}, + {"cmdstat", TypePattern{IntType, KindCode::greaterOrEqualToKind, 2}, + Rank::scalar, Optionality::optional, common::Intent::Out}, {"cmdmsg", DefaultChar, Rank::scalar, Optionality::optional, common::Intent::InOut}}, {}, Rank::elemental, IntrinsicClass::impureSubroutine}, @@ -1834,7 +1837,10 @@ std::optional IntrinsicInterface::Match( argOk = true; break; case KindCode::exactKind: - argOk = type->kind() == d.typePattern.exactKindValue; + argOk = type->kind() == d.typePattern.kindValue; + break; + case KindCode::greaterOrEqualToKind: + argOk = type->kind() >= d.typePattern.kindValue; break; case KindCode::sameAtom: if (!sameArg) { @@ -2177,8 +2183,9 @@ std::optional IntrinsicInterface::Match( resultType = DynamicType{ GetBuiltinDerivedType(builtinsScope, "__builtin_team_type")}; break; + case KindCode::greaterOrEqualToKind: case KindCode::exactKind: - resultType = DynamicType{*category, result.exactKindValue}; + resultType = DynamicType{*category, result.kindValue}; break; case KindCode::typeless: case KindCode::any: diff --git a/flang/test/Semantics/execute_command_line.f90 b/flang/test/Semantics/execute_command_line.f90 new file mode 100644 
index 00000000000000..a66bbce705715d --- /dev/null +++ b/flang/test/Semantics/execute_command_line.f90 @@ -0,0 +1,29 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic +! Tests for the EXECUTE_COMMAND_LINE intrinsics + +subroutine bad_kind_error(command, exitVal, cmdVal) +CHARACTER(30) :: command +INTEGER(KIND=2) :: exitVal +INTEGER(KIND=1) :: cmdVal +!ERROR: Actual argument for 'exitstat=' has bad type or kind 'INTEGER(2)' +call execute_command_line(command, exitstat=exitVal) + +!ERROR: Actual argument for 'cmdstat=' has bad type or kind 'INTEGER(1)' +call execute_command_line(command, cmdstat=cmdVal) +end subroutine bad_kind_error + +subroutine good_kind_equal(command, exitVal, cmdVal) +CHARACTER(30) :: command +INTEGER(KIND=4) :: exitVal +INTEGER(KIND=2) :: cmdVal +call execute_command_line(command, exitstat=exitVal) +call execute_command_line(command, cmdstat=cmdVal) +end subroutine good_kind_equal + +subroutine good_kind_greater(command, exitVal, cmdVal) +CHARACTER(30) :: command +INTEGER(KIND=8) :: exitVal +INTEGER(KIND=4) :: cmdVal +call execute_command_line(command, exitstat=exitVal) +call execute_command_line(command, cmdstat=cmdVal) +end subroutine good_kind_greater diff --git a/libcxx/docs/Modules.rst b/libcxx/docs/Modules.rst index 533c3fbd2a1eea..ee2b81d3b9e7ca 100644 --- a/libcxx/docs/Modules.rst +++ b/libcxx/docs/Modules.rst @@ -218,9 +218,13 @@ Building this project is done with the following steps, assuming the files $ mkdir build $ cmake -G Ninja -S . -B build -DCMAKE_CXX_COMPILER= -DLIBCXX_BUILD= + $ ninja -j1 std -C build $ ninja -C build $ build/main +.. note:: The ``std`` dependencies of ``std.compat`` are not always resolved when + building the ``std`` target using multiple jobs. + .. warning:: ```` should point point to the real binary and not to a symlink.
diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index fb3d2af544c287..7ea13e6943dd4c 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -125,6 +125,8 @@ Improvements and New Features ``${PREFIX}/share/libc++/v1``. - AddressSanitizer annotations have been added to ``std::basic_string``. + These annotations are enabled for all allocators by default. + They are only enabled for long strings; strings using the small buffer optimization are not annotated. - The libc++ source code has been formatted with ``clang-format``. This `discourse thread `_ @@ -275,11 +277,10 @@ ABI Affecting Changes results in an ABI break, however in practice we expect uses of ``std::projected`` in ABI-sensitive places to be extremely rare. Any error resulting from this change should result in a link-time error. -- Under the unstable ABI, the internal alignment requirements for heap allocations - inside ``std::string`` has decreased from 16 to 8. This saves memory since string requests fewer additional - bytes than it did previously. However, this also changes the return value of ``std::string::max_size`` - and can cause code compiled against older libc++ versions but linked at runtime to a new version - to throw a different exception when attempting allocations that are too large +- The internal alignment requirements for heap allocations inside ``std::string`` have decreased from 16 to 8. This + saves memory since string requests fewer additional bytes than it did previously. However, this also changes the + return value of ``std::string::max_size`` and can cause code compiled against older libc++ versions but linked at + runtime to a new version to throw a different exception when attempting allocations that are too large
- The layout of some range adaptors that use the ``movable-box`` exposition-only type as an implementation @@ -342,3 +343,6 @@ Build System Changes them so that they can interoperate with other system-provided libraries that might be using a different unwinding library (such as ``libgcc_s``), you should pass ``LIBCXXABI_USE_LLVM_UNWINDER=OFF`` and ``COMPILER_RT_USE_LLVM_UNWINDER=OFF`` to make sure the system-provided unwinding library is used by the LLVM runtimes. + +- In Clang-cl configurations, libc++ can now be built against the static and/or + debug MSVC C runtimes diff --git a/libcxx/include/__algorithm/copy_move_common.h b/libcxx/include/__algorithm/copy_move_common.h index b350507e32bae9..0fc7a5e3cee700 100644 --- a/libcxx/include/__algorithm/copy_move_common.h +++ b/libcxx/include/__algorithm/copy_move_common.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // Type traits. @@ -132,4 +135,6 @@ __dispatch_copy_or_move(_InIter __first, _Sent __last, _OutIter __out_first) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_COPY_MOVE_COMMON_H diff --git a/libcxx/include/__algorithm/equal.h b/libcxx/include/__algorithm/equal.h index f03f010aa51ab3..3c0e3060e39a99 100644 --- a/libcxx/include/__algorithm/equal.h +++ b/libcxx/include/__algorithm/equal.h @@ -30,6 +30,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -162,4 +165,6 @@ equal(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_EQUAL_H diff --git a/libcxx/include/__algorithm/equal_range.h b/libcxx/include/__algorithm/equal_range.h index 7ce54965fff05f..a94290431971c4 100644 --- a/libcxx/include/__algorithm/equal_range.h +++ b/libcxx/include/__algorithm/equal_range.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif 
+_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -77,4 +80,6 @@ equal_range(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __valu _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_EQUAL_RANGE_H diff --git a/libcxx/include/__algorithm/fold.h b/libcxx/include/__algorithm/fold.h index 88e6814d5cf99d..1a9d76b50d83c9 100644 --- a/libcxx/include/__algorithm/fold.h +++ b/libcxx/include/__algorithm/fold.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 23 @@ -122,4 +125,6 @@ inline constexpr auto fold_left = __fold_left(); _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_FOLD_H diff --git a/libcxx/include/__algorithm/in_found_result.h b/libcxx/include/__algorithm/in_found_result.h index 88a0255d169831..a67ae387974c0a 100644 --- a/libcxx/include/__algorithm/in_found_result.h +++ b/libcxx/include/__algorithm/in_found_result.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -46,4 +49,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_IN_FOUND_RESULT_H diff --git a/libcxx/include/__algorithm/in_fun_result.h b/libcxx/include/__algorithm/in_fun_result.h index 6110c1cf86cd52..a22069a9a8ddaa 100644 --- a/libcxx/include/__algorithm/in_fun_result.h +++ b/libcxx/include/__algorithm/in_fun_result.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -46,4 +49,6 @@ struct in_fun_result { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_IN_FUN_RESULT_H diff --git a/libcxx/include/__algorithm/in_in_out_result.h b/libcxx/include/__algorithm/in_in_out_result.h index 
95ce4f4fd5bd44..ba0380b5c68147 100644 --- a/libcxx/include/__algorithm/in_in_out_result.h +++ b/libcxx/include/__algorithm/in_in_out_result.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -51,4 +54,6 @@ struct in_in_out_result { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_IN_IN_OUT_RESULT_H diff --git a/libcxx/include/__algorithm/in_in_result.h b/libcxx/include/__algorithm/in_in_result.h index d1d62dae7f6703..994573fc70fd88 100644 --- a/libcxx/include/__algorithm/in_in_result.h +++ b/libcxx/include/__algorithm/in_in_result.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -48,4 +51,6 @@ struct in_in_result { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_IN_IN_RESULT_H diff --git a/libcxx/include/__algorithm/in_out_out_result.h b/libcxx/include/__algorithm/in_out_out_result.h index 14364236875086..8ceb452841a419 100644 --- a/libcxx/include/__algorithm/in_out_out_result.h +++ b/libcxx/include/__algorithm/in_out_out_result.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -49,4 +52,6 @@ struct in_out_out_result { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_IN_OUT_OUT_RESULT_H diff --git a/libcxx/include/__algorithm/includes.h b/libcxx/include/__algorithm/includes.h index 531752e9317569..05d45365eb806f 100644 --- a/libcxx/include/__algorithm/includes.h +++ b/libcxx/include/__algorithm/includes.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -71,4 +74,6 @@ includes(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __fi 
_LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_INCLUDES_H diff --git a/libcxx/include/__algorithm/next_permutation.h b/libcxx/include/__algorithm/next_permutation.h index d66ea9b973453f..011ee028cc2f52 100644 --- a/libcxx/include/__algorithm/next_permutation.h +++ b/libcxx/include/__algorithm/next_permutation.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -67,4 +70,6 @@ next_permutation(_BidirectionalIterator __first, _BidirectionalIterator __last) _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_NEXT_PERMUTATION_H diff --git a/libcxx/include/__algorithm/nth_element.h b/libcxx/include/__algorithm/nth_element.h index 37ddfbdacf044d..da748d7255aba6 100644 --- a/libcxx/include/__algorithm/nth_element.h +++ b/libcxx/include/__algorithm/nth_element.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -253,4 +256,6 @@ nth_element(_RandomAccessIterator __first, _RandomAccessIterator __nth, _RandomA _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_NTH_ELEMENT_H diff --git a/libcxx/include/__algorithm/partial_sort.h b/libcxx/include/__algorithm/partial_sort.h index 27511a124229bb..85a8fdc77aa228 100644 --- a/libcxx/include/__algorithm/partial_sort.h +++ b/libcxx/include/__algorithm/partial_sort.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -83,4 +86,6 @@ partial_sort(_RandomAccessIterator __first, _RandomAccessIterator __middle, _Ran _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PARTIAL_SORT_H diff --git a/libcxx/include/__algorithm/partial_sort_copy.h b/libcxx/include/__algorithm/partial_sort_copy.h index e7d8df4de89f95..ef7c9d34d94983 100644 --- 
a/libcxx/include/__algorithm/partial_sort_copy.h +++ b/libcxx/include/__algorithm/partial_sort_copy.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -82,4 +85,6 @@ partition(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PARTITION_H diff --git a/libcxx/include/__algorithm/prev_permutation.h b/libcxx/include/__algorithm/prev_permutation.h index 3e4bbb3fbb1670..8d15b6806401d8 100644 --- a/libcxx/include/__algorithm/prev_permutation.h +++ b/libcxx/include/__algorithm/prev_permutation.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -67,4 +70,6 @@ prev_permutation(_BidirectionalIterator __first, _BidirectionalIterator __last) _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PREV_PERMUTATION_H diff --git a/libcxx/include/__algorithm/pstl_any_all_none_of.h b/libcxx/include/__algorithm/pstl_any_all_none_of.h index d93fdba2224c9b..4b1e0e61b54218 100644 --- a/libcxx/include/__algorithm/pstl_any_all_none_of.h +++ b/libcxx/include/__algorithm/pstl_any_all_none_of.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -144,4 +147,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_ANY_ALL_NONE_OF_H diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h index ab2e3172b8b63b..14a0d76741d4c5 100644 --- 
a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -194,4 +197,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_REDUCE_H diff --git a/libcxx/include/__algorithm/pstl_copy.h b/libcxx/include/__algorithm/pstl_copy.h index 19f275a0d5d97f..1069dcec0e117a 100644 --- a/libcxx/include/__algorithm/pstl_copy.h +++ b/libcxx/include/__algorithm/pstl_copy.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -113,4 +116,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_COPY_H diff --git a/libcxx/include/__algorithm/pstl_count.h b/libcxx/include/__algorithm/pstl_count.h index 28806fca063701..2781f6bfd3c9e0 100644 --- a/libcxx/include/__algorithm/pstl_count.h +++ b/libcxx/include/__algorithm/pstl_count.h @@ -29,6 +29,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -113,4 +116,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_COUNT_H diff --git a/libcxx/include/__algorithm/pstl_equal.h b/libcxx/include/__algorithm/pstl_equal.h index b343d2675980c9..d235c0f4f41972 100644 --- a/libcxx/include/__algorithm/pstl_equal.h +++ 
b/libcxx/include/__algorithm/pstl_equal.h @@ -21,6 +21,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -167,4 +170,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_EQUAL_H diff --git a/libcxx/include/__algorithm/pstl_fill.h b/libcxx/include/__algorithm/pstl_fill.h index 3057dcc04f1ad7..488b49a0feec96 100644 --- a/libcxx/include/__algorithm/pstl_fill.h +++ b/libcxx/include/__algorithm/pstl_fill.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -108,4 +111,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_FILL_H diff --git a/libcxx/include/__algorithm/pstl_find.h b/libcxx/include/__algorithm/pstl_find.h index adc05ea1a9e55a..5b694db68aead4 100644 --- a/libcxx/include/__algorithm/pstl_find.h +++ b/libcxx/include/__algorithm/pstl_find.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -133,4 +136,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_FIND_H diff --git a/libcxx/include/__algorithm/pstl_for_each.h b/libcxx/include/__algorithm/pstl_for_each.h index 819a43d685abed..bb7b5a61a6dc0d 100644 --- a/libcxx/include/__algorithm/pstl_for_each.h +++ b/libcxx/include/__algorithm/pstl_for_each.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + 
#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -100,4 +103,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_FOR_EACH_H diff --git a/libcxx/include/__algorithm/pstl_generate.h b/libcxx/include/__algorithm/pstl_generate.h index 56538392d5b5dd..7133c6f4f4c621 100644 --- a/libcxx/include/__algorithm/pstl_generate.h +++ b/libcxx/include/__algorithm/pstl_generate.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -106,4 +109,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_GENERATE_H diff --git a/libcxx/include/__algorithm/pstl_is_partitioned.h b/libcxx/include/__algorithm/pstl_is_partitioned.h index 39cf6369339db6..b6543021220727 100644 --- a/libcxx/include/__algorithm/pstl_is_partitioned.h +++ b/libcxx/include/__algorithm/pstl_is_partitioned.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -69,4 +72,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_IS_PARITTIONED diff --git a/libcxx/include/__algorithm/pstl_merge.h b/libcxx/include/__algorithm/pstl_merge.h index ed801451086326..3d262db6bc0c15 100644 --- a/libcxx/include/__algorithm/pstl_merge.h +++ b/libcxx/include/__algorithm/pstl_merge.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 
_LIBCPP_BEGIN_NAMESPACE_STD @@ -84,4 +87,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_MERGE_H diff --git a/libcxx/include/__algorithm/pstl_move.h b/libcxx/include/__algorithm/pstl_move.h index 52baab57591e26..d8441f1a6c2e16 100644 --- a/libcxx/include/__algorithm/pstl_move.h +++ b/libcxx/include/__algorithm/pstl_move.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -76,4 +79,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_MOVE_H diff --git a/libcxx/include/__algorithm/pstl_replace.h b/libcxx/include/__algorithm/pstl_replace.h index 05dee3f6a4f30c..b1caf3fd4ac0a1 100644 --- a/libcxx/include/__algorithm/pstl_replace.h +++ b/libcxx/include/__algorithm/pstl_replace.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -239,4 +242,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_REPLACE_H diff --git a/libcxx/include/__algorithm/pstl_rotate_copy.h b/libcxx/include/__algorithm/pstl_rotate_copy.h index 33dc9a3635f7e8..346aab1d4a55c0 100644 --- a/libcxx/include/__algorithm/pstl_rotate_copy.h +++ b/libcxx/include/__algorithm/pstl_rotate_copy.h @@ -19,6 +19,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -77,4 +80,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // 
!defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_ROTATE_COPY_H diff --git a/libcxx/include/__algorithm/pstl_sort.h b/libcxx/include/__algorithm/pstl_sort.h index 3e71e0aa5ae0a1..a931f768111a23 100644 --- a/libcxx/include/__algorithm/pstl_sort.h +++ b/libcxx/include/__algorithm/pstl_sort.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -74,4 +77,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_SORT_H diff --git a/libcxx/include/__algorithm/pstl_stable_sort.h b/libcxx/include/__algorithm/pstl_stable_sort.h index c9d375535fc450..8ea0bb3f9a8d59 100644 --- a/libcxx/include/__algorithm/pstl_stable_sort.h +++ b/libcxx/include/__algorithm/pstl_stable_sort.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -53,4 +56,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_PSTL_STABLE_SORT_H diff --git a/libcxx/include/__algorithm/pstl_transform.h b/libcxx/include/__algorithm/pstl_transform.h index aad59d1f30e6b9..f95938782fc3bd 100644 --- a/libcxx/include/__algorithm/pstl_transform.h +++ b/libcxx/include/__algorithm/pstl_transform.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -112,4 +115,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // 
_LIBCPP___ALGORITHM_PSTL_TRANSFORM_H diff --git a/libcxx/include/__algorithm/ranges_all_of.h b/libcxx/include/__algorithm/ranges_all_of.h index 39a2ae4de01e99..8976541d590cad 100644 --- a/libcxx/include/__algorithm/ranges_all_of.h +++ b/libcxx/include/__algorithm/ranges_all_of.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -66,4 +69,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_ALL_OF_H diff --git a/libcxx/include/__algorithm/ranges_any_of.h b/libcxx/include/__algorithm/ranges_any_of.h index 2ca8531102eac6..7c775f5f64dec0 100644 --- a/libcxx/include/__algorithm/ranges_any_of.h +++ b/libcxx/include/__algorithm/ranges_any_of.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -66,4 +69,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_ANY_OF_H diff --git a/libcxx/include/__algorithm/ranges_binary_search.h b/libcxx/include/__algorithm/ranges_binary_search.h index 22008e0f1bc8f6..f3b7842d5cccd4 100644 --- a/libcxx/include/__algorithm/ranges_binary_search.h +++ b/libcxx/include/__algorithm/ranges_binary_search.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -65,4 +68,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_BINARY_SEARCH_H diff --git a/libcxx/include/__algorithm/ranges_clamp.h b/libcxx/include/__algorithm/ranges_clamp.h index a1185e7278f0ed..f5ef5fd7f26ec8 100644 --- a/libcxx/include/__algorithm/ranges_clamp.h +++ b/libcxx/include/__algorithm/ranges_clamp.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif 
+_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -58,4 +61,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_CLAMP_H diff --git a/libcxx/include/__algorithm/ranges_contains.h b/libcxx/include/__algorithm/ranges_contains.h index f92fcec587d858..00d0e54019887c 100644 --- a/libcxx/include/__algorithm/ranges_contains.h +++ b/libcxx/include/__algorithm/ranges_contains.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 23 _LIBCPP_BEGIN_NAMESPACE_STD @@ -58,4 +61,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 23 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_CONTAINS_H diff --git a/libcxx/include/__algorithm/ranges_copy.h b/libcxx/include/__algorithm/ranges_copy.h index 1c87f074e7cab9..e1d6d32f05f7e6 100644 --- a/libcxx/include/__algorithm/ranges_copy.h +++ b/libcxx/include/__algorithm/ranges_copy.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -63,4 +66,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_COPY_H diff --git a/libcxx/include/__algorithm/ranges_copy_backward.h b/libcxx/include/__algorithm/ranges_copy_backward.h index 865e944d4384dd..93e326042503fd 100644 --- a/libcxx/include/__algorithm/ranges_copy_backward.h +++ b/libcxx/include/__algorithm/ranges_copy_backward.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -61,4 +64,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_COPY_BACKWARD_H diff --git a/libcxx/include/__algorithm/ranges_copy_if.h 
b/libcxx/include/__algorithm/ranges_copy_if.h index b77dbd37fcee3a..4b41d2154e7f83 100644 --- a/libcxx/include/__algorithm/ranges_copy_if.h +++ b/libcxx/include/__algorithm/ranges_copy_if.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -79,4 +82,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_COPY_IF_H diff --git a/libcxx/include/__algorithm/ranges_copy_n.h b/libcxx/include/__algorithm/ranges_copy_n.h index 99e8eee14d0f83..4353fa99278c8b 100644 --- a/libcxx/include/__algorithm/ranges_copy_n.h +++ b/libcxx/include/__algorithm/ranges_copy_n.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -73,4 +76,6 @@ inline constexpr auto copy_n = __copy_n::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_COPY_N_H diff --git a/libcxx/include/__algorithm/ranges_count.h b/libcxx/include/__algorithm/ranges_count.h index 4c8f1b2cbea7e4..a8965c1b961f33 100644 --- a/libcxx/include/__algorithm/ranges_count.h +++ b/libcxx/include/__algorithm/ranges_count.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -58,4 +61,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_COUNT_H diff --git a/libcxx/include/__algorithm/ranges_count_if.h b/libcxx/include/__algorithm/ranges_count_if.h index 92f37d049e0c4d..71b942dd5322b7 100644 --- a/libcxx/include/__algorithm/ranges_count_if.h +++ b/libcxx/include/__algorithm/ranges_count_if.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ 
-71,4 +74,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_COUNT_IF_H diff --git a/libcxx/include/__algorithm/ranges_ends_with.h b/libcxx/include/__algorithm/ranges_ends_with.h index 2afb74bff0f152..c2a3cae9f3b16a 100644 --- a/libcxx/include/__algorithm/ranges_ends_with.h +++ b/libcxx/include/__algorithm/ranges_ends_with.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 23 _LIBCPP_BEGIN_NAMESPACE_STD @@ -193,4 +196,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 23 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_ENDS_WITH_H diff --git a/libcxx/include/__algorithm/ranges_equal.h b/libcxx/include/__algorithm/ranges_equal.h index 4cb1f7df1952e5..31c7ee261da61f 100644 --- a/libcxx/include/__algorithm/ranges_equal.h +++ b/libcxx/include/__algorithm/ranges_equal.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -101,4 +104,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_EQUAL_H diff --git a/libcxx/include/__algorithm/ranges_equal_range.h b/libcxx/include/__algorithm/ranges_equal_range.h index 1ff8856ca03f1e..4c1c3834ba9f9f 100644 --- a/libcxx/include/__algorithm/ranges_equal_range.h +++ b/libcxx/include/__algorithm/ranges_equal_range.h @@ -30,6 +30,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -72,4 +75,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_EQUAL_RANGE_H diff --git a/libcxx/include/__algorithm/ranges_fill.h b/libcxx/include/__algorithm/ranges_fill.h index 88a892f5c27865..7a177d85e9f07f 100644 --- a/libcxx/include/__algorithm/ranges_fill.h 
+++ b/libcxx/include/__algorithm/ranges_fill.h @@ -20,6 +20,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -54,4 +57,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FILL_H diff --git a/libcxx/include/__algorithm/ranges_fill_n.h b/libcxx/include/__algorithm/ranges_fill_n.h index dbd8ec27aef9f5..a6e988c0089ce4 100644 --- a/libcxx/include/__algorithm/ranges_fill_n.h +++ b/libcxx/include/__algorithm/ranges_fill_n.h @@ -17,6 +17,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -45,4 +48,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FILL_N_H diff --git a/libcxx/include/__algorithm/ranges_find.h b/libcxx/include/__algorithm/ranges_find.h index de870e381184c6..7459fad717a5d6 100644 --- a/libcxx/include/__algorithm/ranges_find.h +++ b/libcxx/include/__algorithm/ranges_find.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -72,4 +75,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FIND_H diff --git a/libcxx/include/__algorithm/ranges_find_end.h b/libcxx/include/__algorithm/ranges_find_end.h index 2c57ad424bfdea..0bda4f3e1cea9e 100644 --- a/libcxx/include/__algorithm/ranges_find_end.h +++ b/libcxx/include/__algorithm/ranges_find_end.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -95,4 +98,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FIND_END_H diff --git 
a/libcxx/include/__algorithm/ranges_find_first_of.h b/libcxx/include/__algorithm/ranges_find_first_of.h index ec6d52c63250b5..63a7b8335faaf5 100644 --- a/libcxx/include/__algorithm/ranges_find_first_of.h +++ b/libcxx/include/__algorithm/ranges_find_first_of.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -98,4 +101,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FIND_FIRST_OF_H diff --git a/libcxx/include/__algorithm/ranges_find_if.h b/libcxx/include/__algorithm/ranges_find_if.h index af54a5007ee259..52ae55ce96c366 100644 --- a/libcxx/include/__algorithm/ranges_find_if.h +++ b/libcxx/include/__algorithm/ranges_find_if.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -67,4 +70,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FIND_IF_H diff --git a/libcxx/include/__algorithm/ranges_find_if_not.h b/libcxx/include/__algorithm/ranges_find_if_not.h index a18bea43165e0d..60c6796cbbfcc7 100644 --- a/libcxx/include/__algorithm/ranges_find_if_not.h +++ b/libcxx/include/__algorithm/ranges_find_if_not.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -61,4 +64,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FIND_IF_NOT_H diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h index 7878ed26709fb6..225dc774c8764a 100644 --- a/libcxx/include/__algorithm/ranges_for_each.h +++ b/libcxx/include/__algorithm/ranges_for_each.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif 
+_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -73,4 +76,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h index 53ccb9a6035a48..d1fdca34cc5a19 100644 --- a/libcxx/include/__algorithm/ranges_for_each_n.h +++ b/libcxx/include/__algorithm/ranges_for_each_n.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -56,4 +59,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H diff --git a/libcxx/include/__algorithm/ranges_generate.h b/libcxx/include/__algorithm/ranges_generate.h index 3ff1e13c422090..e6467198e6ba2f 100644 --- a/libcxx/include/__algorithm/ranges_generate.h +++ b/libcxx/include/__algorithm/ranges_generate.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -65,4 +68,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_GENERATE_H diff --git a/libcxx/include/__algorithm/ranges_generate_n.h b/libcxx/include/__algorithm/ranges_generate_n.h index c025c621a191c2..cd5fd7483ab2c6 100644 --- a/libcxx/include/__algorithm/ranges_generate_n.h +++ b/libcxx/include/__algorithm/ranges_generate_n.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -57,4 +60,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_GENERATE_N_H diff --git a/libcxx/include/__algorithm/ranges_includes.h 
b/libcxx/include/__algorithm/ranges_includes.h index aa35080c8cfd4b..0bc4c043bd1881 100644 --- a/libcxx/include/__algorithm/ranges_includes.h +++ b/libcxx/include/__algorithm/ranges_includes.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -90,4 +93,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_INCLUDES_H diff --git a/libcxx/include/__algorithm/ranges_inplace_merge.h b/libcxx/include/__algorithm/ranges_inplace_merge.h index 86001b003d5ca1..d94c0ad4656776 100644 --- a/libcxx/include/__algorithm/ranges_inplace_merge.h +++ b/libcxx/include/__algorithm/ranges_inplace_merge.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -76,4 +79,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_INPLACE_MERGE_H diff --git a/libcxx/include/__algorithm/ranges_is_heap.h b/libcxx/include/__algorithm/ranges_is_heap.h index f298c347b747a0..122368c90d924d 100644 --- a/libcxx/include/__algorithm/ranges_is_heap.h +++ b/libcxx/include/__algorithm/ranges_is_heap.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -73,4 +76,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_IS_HEAP_H diff --git a/libcxx/include/__algorithm/ranges_is_heap_until.h b/libcxx/include/__algorithm/ranges_is_heap_until.h index 73f13fb50440ec..b2705d37a6d345 100644 --- a/libcxx/include/__algorithm/ranges_is_heap_until.h +++ b/libcxx/include/__algorithm/ranges_is_heap_until.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if 
_LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -73,4 +76,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_IS_HEAP_UNTIL_H diff --git a/libcxx/include/__algorithm/ranges_is_partitioned.h b/libcxx/include/__algorithm/ranges_is_partitioned.h index 76db870efc7073..c6a585c9f51070 100644 --- a/libcxx/include/__algorithm/ranges_is_partitioned.h +++ b/libcxx/include/__algorithm/ranges_is_partitioned.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -78,4 +81,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_IS_PARTITIONED_H diff --git a/libcxx/include/__algorithm/ranges_is_permutation.h b/libcxx/include/__algorithm/ranges_is_permutation.h index 2b99839bc66fa7..e0423d722b5b98 100644 --- a/libcxx/include/__algorithm/ranges_is_permutation.h +++ b/libcxx/include/__algorithm/ranges_is_permutation.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -99,4 +102,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_IS_PERMUTATION_H diff --git a/libcxx/include/__algorithm/ranges_is_sorted.h b/libcxx/include/__algorithm/ranges_is_sorted.h index 3eb2c768d66a20..d71035d5aa1d01 100644 --- a/libcxx/include/__algorithm/ranges_is_sorted.h +++ b/libcxx/include/__algorithm/ranges_is_sorted.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -59,4 +62,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP__ALGORITHM_RANGES_IS_SORTED_H diff --git a/libcxx/include/__algorithm/ranges_is_sorted_until.h 
b/libcxx/include/__algorithm/ranges_is_sorted_until.h index 19e9875d2757d4..dcfb6a4e1813bd 100644 --- a/libcxx/include/__algorithm/ranges_is_sorted_until.h +++ b/libcxx/include/__algorithm/ranges_is_sorted_until.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -74,4 +77,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP__ALGORITHM_RANGES_IS_SORTED_UNTIL_H diff --git a/libcxx/include/__algorithm/ranges_iterator_concept.h b/libcxx/include/__algorithm/ranges_iterator_concept.h index 9a92030403361b..2af891d3af005a 100644 --- a/libcxx/include/__algorithm/ranges_iterator_concept.h +++ b/libcxx/include/__algorithm/ranges_iterator_concept.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -48,4 +51,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_ITERATOR_CONCEPT_H diff --git a/libcxx/include/__algorithm/ranges_lexicographical_compare.h b/libcxx/include/__algorithm/ranges_lexicographical_compare.h index 5b843dfd7b3139..90e96b5465169b 100644 --- a/libcxx/include/__algorithm/ranges_lexicographical_compare.h +++ b/libcxx/include/__algorithm/ranges_lexicographical_compare.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -98,4 +101,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_LEXICOGRAPHICAL_COMPARE_H diff --git a/libcxx/include/__algorithm/ranges_lower_bound.h b/libcxx/include/__algorithm/ranges_lower_bound.h index 58b3f815b96a45..ab1f80e7ab7705 100644 --- a/libcxx/include/__algorithm/ranges_lower_bound.h +++ 
b/libcxx/include/__algorithm/ranges_lower_bound.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -65,4 +68,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_LOWER_BOUND_H diff --git a/libcxx/include/__algorithm/ranges_make_heap.h b/libcxx/include/__algorithm/ranges_make_heap.h index f17eabff43d2a1..fe9c024fbf8a83 100644 --- a/libcxx/include/__algorithm/ranges_make_heap.h +++ b/libcxx/include/__algorithm/ranges_make_heap.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -77,4 +80,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_MAKE_HEAP_H diff --git a/libcxx/include/__algorithm/ranges_max_element.h b/libcxx/include/__algorithm/ranges_max_element.h index 2ba97042f1f6e0..83adf49b61ad8f 100644 --- a/libcxx/include/__algorithm/ranges_max_element.h +++ b/libcxx/include/__algorithm/ranges_max_element.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -61,4 +64,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_MAX_ELEMENT_H diff --git a/libcxx/include/__algorithm/ranges_merge.h b/libcxx/include/__algorithm/ranges_merge.h index 7f49154ec9221f..bdf9a62d90bd24 100644 --- a/libcxx/include/__algorithm/ranges_merge.h +++ b/libcxx/include/__algorithm/ranges_merge.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -130,4 +133,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // 
_LIBCPP___ALGORITHM_RANGES_MERGE_H diff --git a/libcxx/include/__algorithm/ranges_min_element.h b/libcxx/include/__algorithm/ranges_min_element.h index 07826a0e6b817a..4b9cb76da5789c 100644 --- a/libcxx/include/__algorithm/ranges_min_element.h +++ b/libcxx/include/__algorithm/ranges_min_element.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -73,4 +76,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_MIN_ELEMENT_H diff --git a/libcxx/include/__algorithm/ranges_minmax_element.h b/libcxx/include/__algorithm/ranges_minmax_element.h index a52319f6b5d3fd..5132856ebcd5ca 100644 --- a/libcxx/include/__algorithm/ranges_minmax_element.h +++ b/libcxx/include/__algorithm/ranges_minmax_element.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -70,4 +73,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_MINMAX_H diff --git a/libcxx/include/__algorithm/ranges_mismatch.h b/libcxx/include/__algorithm/ranges_mismatch.h index db9bfc8e87db67..037af39126230a 100644 --- a/libcxx/include/__algorithm/ranges_mismatch.h +++ b/libcxx/include/__algorithm/ranges_mismatch.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -86,4 +89,6 @@ constexpr inline auto mismatch = __mismatch::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_MISMATCH_H diff --git a/libcxx/include/__algorithm/ranges_move.h b/libcxx/include/__algorithm/ranges_move.h index 8bd2409f891c05..be869f36c97304 100644 --- a/libcxx/include/__algorithm/ranges_move.h +++ b/libcxx/include/__algorithm/ranges_move.h @@ -23,6 
+23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -66,4 +69,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_MOVE_H diff --git a/libcxx/include/__algorithm/ranges_move_backward.h b/libcxx/include/__algorithm/ranges_move_backward.h index ee390a40e489a4..6d4071a33b8125 100644 --- a/libcxx/include/__algorithm/ranges_move_backward.h +++ b/libcxx/include/__algorithm/ranges_move_backward.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -68,4 +71,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_MOVE_BACKWARD_H diff --git a/libcxx/include/__algorithm/ranges_next_permutation.h b/libcxx/include/__algorithm/ranges_next_permutation.h index 9ebab3ea7c13bd..18535e0a6254a1 100644 --- a/libcxx/include/__algorithm/ranges_next_permutation.h +++ b/libcxx/include/__algorithm/ranges_next_permutation.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -70,4 +73,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_NEXT_PERMUTATION_H diff --git a/libcxx/include/__algorithm/ranges_none_of.h b/libcxx/include/__algorithm/ranges_none_of.h index b0d363895e000b..59bd87997d448f 100644 --- a/libcxx/include/__algorithm/ranges_none_of.h +++ b/libcxx/include/__algorithm/ranges_none_of.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -67,4 +70,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // 
_LIBCPP___ALGORITHM_RANGES_NONE_OF_H diff --git a/libcxx/include/__algorithm/ranges_nth_element.h b/libcxx/include/__algorithm/ranges_nth_element.h index 7abdbd0889e0cb..90ade9efe10da6 100644 --- a/libcxx/include/__algorithm/ranges_nth_element.h +++ b/libcxx/include/__algorithm/ranges_nth_element.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -76,4 +79,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_NTH_ELEMENT_H diff --git a/libcxx/include/__algorithm/ranges_partial_sort.h b/libcxx/include/__algorithm/ranges_partial_sort.h index 9ec8882097d784..c67247d2e0a77e 100644 --- a/libcxx/include/__algorithm/ranges_partial_sort.h +++ b/libcxx/include/__algorithm/ranges_partial_sort.h @@ -33,6 +33,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -74,4 +77,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_PARTIAL_SORT_H diff --git a/libcxx/include/__algorithm/ranges_partial_sort_copy.h b/libcxx/include/__algorithm/ranges_partial_sort_copy.h index eba7d9ac416576..b3bdeb78fb6f65 100644 --- a/libcxx/include/__algorithm/ranges_partial_sort_copy.h +++ b/libcxx/include/__algorithm/ranges_partial_sort_copy.h @@ -30,6 +30,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -106,4 +109,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_PARTIAL_SORT_COPY_H diff --git a/libcxx/include/__algorithm/ranges_partition.h b/libcxx/include/__algorithm/ranges_partition.h index 89d192b51fd329..a67ac4c967570f 100644 --- a/libcxx/include/__algorithm/ranges_partition.h +++ 
b/libcxx/include/__algorithm/ranges_partition.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -80,4 +83,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_PARTITION_H diff --git a/libcxx/include/__algorithm/ranges_partition_copy.h b/libcxx/include/__algorithm/ranges_partition_copy.h index 6a16b02db3e554..d60c865dd2a8a3 100644 --- a/libcxx/include/__algorithm/ranges_partition_copy.h +++ b/libcxx/include/__algorithm/ranges_partition_copy.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -102,4 +105,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_PARTITION_COPY_H diff --git a/libcxx/include/__algorithm/ranges_partition_point.h b/libcxx/include/__algorithm/ranges_partition_point.h index 6fc20e7d00e9f6..c5b11b5fed192a 100644 --- a/libcxx/include/__algorithm/ranges_partition_point.h +++ b/libcxx/include/__algorithm/ranges_partition_point.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -85,4 +88,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_PARTITION_POINT_H diff --git a/libcxx/include/__algorithm/ranges_pop_heap.h b/libcxx/include/__algorithm/ranges_pop_heap.h index 364cfe94b161e3..01f92c0f228887 100644 --- a/libcxx/include/__algorithm/ranges_pop_heap.h +++ b/libcxx/include/__algorithm/ranges_pop_heap.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -78,4 +81,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // 
_LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_POP_HEAP_H diff --git a/libcxx/include/__algorithm/ranges_prev_permutation.h b/libcxx/include/__algorithm/ranges_prev_permutation.h index ae7a68cce5fdc6..225cee9b75ec6b 100644 --- a/libcxx/include/__algorithm/ranges_prev_permutation.h +++ b/libcxx/include/__algorithm/ranges_prev_permutation.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -70,4 +73,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_PREV_PERMUTATION_H diff --git a/libcxx/include/__algorithm/ranges_push_heap.h b/libcxx/include/__algorithm/ranges_push_heap.h index 1ed9c953f54c35..9d187af38c5319 100644 --- a/libcxx/include/__algorithm/ranges_push_heap.h +++ b/libcxx/include/__algorithm/ranges_push_heap.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -77,4 +80,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_PUSH_HEAP_H diff --git a/libcxx/include/__algorithm/ranges_remove.h b/libcxx/include/__algorithm/ranges_remove.h index e27c4bdd733d81..315bed8fba775b 100644 --- a/libcxx/include/__algorithm/ranges_remove.h +++ b/libcxx/include/__algorithm/ranges_remove.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -60,4 +63,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REMOVE_H diff --git a/libcxx/include/__algorithm/ranges_remove_copy.h b/libcxx/include/__algorithm/ranges_remove_copy.h index 5158a78e481405..84529eceac68c5 100644 --- a/libcxx/include/__algorithm/ranges_remove_copy.h +++ 
b/libcxx/include/__algorithm/ranges_remove_copy.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -73,4 +76,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REMOVE_COPY_H diff --git a/libcxx/include/__algorithm/ranges_remove_copy_if.h b/libcxx/include/__algorithm/ranges_remove_copy_if.h index c07b4813d7d0a9..56fe017533120b 100644 --- a/libcxx/include/__algorithm/ranges_remove_copy_if.h +++ b/libcxx/include/__algorithm/ranges_remove_copy_if.h @@ -29,6 +29,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -87,4 +90,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REMOVE_COPY_IF_H diff --git a/libcxx/include/__algorithm/ranges_remove_if.h b/libcxx/include/__algorithm/ranges_remove_if.h index 4b7aa2d2be78a8..943dbdd73807e6 100644 --- a/libcxx/include/__algorithm/ranges_remove_if.h +++ b/libcxx/include/__algorithm/ranges_remove_if.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -81,4 +84,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REMOVE_IF_H diff --git a/libcxx/include/__algorithm/ranges_replace.h b/libcxx/include/__algorithm/ranges_replace.h index b66a41aa8d0d77..2b88dc032972f4 100644 --- a/libcxx/include/__algorithm/ranges_replace.h +++ b/libcxx/include/__algorithm/ranges_replace.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -60,4 +63,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + 
#endif // _LIBCPP___ALGORITHM_RANGES_REPLACE_H diff --git a/libcxx/include/__algorithm/ranges_replace_copy.h b/libcxx/include/__algorithm/ranges_replace_copy.h index a7627024812fd2..633f993e5c9484 100644 --- a/libcxx/include/__algorithm/ranges_replace_copy.h +++ b/libcxx/include/__algorithm/ranges_replace_copy.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -85,4 +88,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REPLACE_COPY_H diff --git a/libcxx/include/__algorithm/ranges_replace_copy_if.h b/libcxx/include/__algorithm/ranges_replace_copy_if.h index 10ed1fda6c5c86..e065c3ac0acc90 100644 --- a/libcxx/include/__algorithm/ranges_replace_copy_if.h +++ b/libcxx/include/__algorithm/ranges_replace_copy_if.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -90,4 +93,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REPLACE_COPY_IF_H diff --git a/libcxx/include/__algorithm/ranges_replace_if.h b/libcxx/include/__algorithm/ranges_replace_if.h index 519fa32029ac67..6445f42aea1908 100644 --- a/libcxx/include/__algorithm/ranges_replace_if.h +++ b/libcxx/include/__algorithm/ranges_replace_if.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -73,4 +76,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REPLACE_IF_H diff --git a/libcxx/include/__algorithm/ranges_reverse_copy.h b/libcxx/include/__algorithm/ranges_reverse_copy.h index 35b9edba0bfb26..60043787a71705 100644 --- a/libcxx/include/__algorithm/ranges_reverse_copy.h +++ 
b/libcxx/include/__algorithm/ranges_reverse_copy.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -62,4 +65,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_REVERSE_COPY_H diff --git a/libcxx/include/__algorithm/ranges_rotate.h b/libcxx/include/__algorithm/ranges_rotate.h index ebed9bbd542665..8d33a6f0799bf7 100644 --- a/libcxx/include/__algorithm/ranges_rotate.h +++ b/libcxx/include/__algorithm/ranges_rotate.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -63,4 +66,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_ROTATE_H diff --git a/libcxx/include/__algorithm/ranges_rotate_copy.h b/libcxx/include/__algorithm/ranges_rotate_copy.h index ab76c0944c4771..26fe110b538963 100644 --- a/libcxx/include/__algorithm/ranges_rotate_copy.h +++ b/libcxx/include/__algorithm/ranges_rotate_copy.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -60,4 +63,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_ROTATE_COPY_H diff --git a/libcxx/include/__algorithm/ranges_sample.h b/libcxx/include/__algorithm/ranges_sample.h index d347d82205a89d..e4f60a7b66be2b 100644 --- a/libcxx/include/__algorithm/ranges_sample.h +++ b/libcxx/include/__algorithm/ranges_sample.h @@ -27,6 +27,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -66,4 +69,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // 
_LIBCPP___ALGORITHM_RANGES_SAMPLE_H diff --git a/libcxx/include/__algorithm/ranges_search_n.h b/libcxx/include/__algorithm/ranges_search_n.h index 4e53f30f71f9d6..4c1d73d8e6c340 100644 --- a/libcxx/include/__algorithm/ranges_search_n.h +++ b/libcxx/include/__algorithm/ranges_search_n.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -108,4 +111,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SEARCH_N_H diff --git a/libcxx/include/__algorithm/ranges_set_difference.h b/libcxx/include/__algorithm/ranges_set_difference.h index a9453ed336f515..0841fb4ffd0c06 100644 --- a/libcxx/include/__algorithm/ranges_set_difference.h +++ b/libcxx/include/__algorithm/ranges_set_difference.h @@ -30,6 +30,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -100,4 +103,7 @@ inline constexpr auto set_difference = __set_difference::__fn{}; _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 + +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SET_DIFFERENCE_H diff --git a/libcxx/include/__algorithm/ranges_set_intersection.h b/libcxx/include/__algorithm/ranges_set_intersection.h index 4cdcbb75051a1f..9427379745b60f 100644 --- a/libcxx/include/__algorithm/ranges_set_intersection.h +++ b/libcxx/include/__algorithm/ranges_set_intersection.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -105,4 +108,7 @@ inline constexpr auto set_intersection = __set_intersection::__fn{}; _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 + +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SET_INTERSECTION_H diff --git a/libcxx/include/__algorithm/ranges_set_symmetric_difference.h 
b/libcxx/include/__algorithm/ranges_set_symmetric_difference.h index d8710a1c47b0bb..995eb0999d940a 100644 --- a/libcxx/include/__algorithm/ranges_set_symmetric_difference.h +++ b/libcxx/include/__algorithm/ranges_set_symmetric_difference.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -105,4 +108,7 @@ inline constexpr auto set_symmetric_difference = __set_symmetric_difference::__f _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 + +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SET_SYMMETRIC_DIFFERENCE_H diff --git a/libcxx/include/__algorithm/ranges_set_union.h b/libcxx/include/__algorithm/ranges_set_union.h index c627166fffed33..e870e390cc6659 100644 --- a/libcxx/include/__algorithm/ranges_set_union.h +++ b/libcxx/include/__algorithm/ranges_set_union.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -107,4 +110,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SET_UNION_H diff --git a/libcxx/include/__algorithm/ranges_shuffle.h b/libcxx/include/__algorithm/ranges_shuffle.h index fca420058dec08..ab98ea22caabec 100644 --- a/libcxx/include/__algorithm/ranges_shuffle.h +++ b/libcxx/include/__algorithm/ranges_shuffle.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -64,4 +67,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SHUFFLE_H diff --git a/libcxx/include/__algorithm/ranges_sort.h b/libcxx/include/__algorithm/ranges_sort.h index 2ad0e0c233be48..0296c146b3edee 100644 --- a/libcxx/include/__algorithm/ranges_sort.h +++ b/libcxx/include/__algorithm/ranges_sort.h @@ -31,6 
+31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -76,4 +79,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SORT_H diff --git a/libcxx/include/__algorithm/ranges_sort_heap.h b/libcxx/include/__algorithm/ranges_sort_heap.h index 365c7dba615674..bab30df1708c75 100644 --- a/libcxx/include/__algorithm/ranges_sort_heap.h +++ b/libcxx/include/__algorithm/ranges_sort_heap.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -77,4 +80,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SORT_HEAP_H diff --git a/libcxx/include/__algorithm/ranges_stable_partition.h b/libcxx/include/__algorithm/ranges_stable_partition.h index 44937fa5899082..f34027ff772c78 100644 --- a/libcxx/include/__algorithm/ranges_stable_partition.h +++ b/libcxx/include/__algorithm/ranges_stable_partition.h @@ -34,6 +34,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -84,4 +87,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_STABLE_PARTITION_H diff --git a/libcxx/include/__algorithm/ranges_stable_sort.h b/libcxx/include/__algorithm/ranges_stable_sort.h index a4eed3836356d4..93909e253cc0f2 100644 --- a/libcxx/include/__algorithm/ranges_stable_sort.h +++ b/libcxx/include/__algorithm/ranges_stable_sort.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -74,4 +77,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // 
_LIBCPP___ALGORITHM_RANGES_STABLE_SORT_H diff --git a/libcxx/include/__algorithm/ranges_starts_with.h b/libcxx/include/__algorithm/ranges_starts_with.h index 7da78001d8148d..90e184aa9bccc2 100644 --- a/libcxx/include/__algorithm/ranges_starts_with.h +++ b/libcxx/include/__algorithm/ranges_starts_with.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 23 _LIBCPP_BEGIN_NAMESPACE_STD @@ -87,4 +90,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 23 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_STARTS_WITH_H diff --git a/libcxx/include/__algorithm/ranges_swap_ranges.h b/libcxx/include/__algorithm/ranges_swap_ranges.h index 1d0ebc0d5221e1..b6d9f618395a5e 100644 --- a/libcxx/include/__algorithm/ranges_swap_ranges.h +++ b/libcxx/include/__algorithm/ranges_swap_ranges.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -62,4 +65,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_SWAP_RANGES_H diff --git a/libcxx/include/__algorithm/ranges_transform.h b/libcxx/include/__algorithm/ranges_transform.h index f66a07ac026e5f..7850ec4f846560 100644 --- a/libcxx/include/__algorithm/ranges_transform.h +++ b/libcxx/include/__algorithm/ranges_transform.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -169,4 +172,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_TRANSFORM_H diff --git a/libcxx/include/__algorithm/ranges_unique.h b/libcxx/include/__algorithm/ranges_unique.h index b17e01fc50577e..7340310eb36a90 100644 --- a/libcxx/include/__algorithm/ranges_unique.h +++ b/libcxx/include/__algorithm/ranges_unique.h @@ -32,6 +32,9 @@ # 
pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -74,4 +77,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_UNIQUE_H diff --git a/libcxx/include/__algorithm/ranges_unique_copy.h b/libcxx/include/__algorithm/ranges_unique_copy.h index 7e89f9d97af7f6..61133885ae809d 100644 --- a/libcxx/include/__algorithm/ranges_unique_copy.h +++ b/libcxx/include/__algorithm/ranges_unique_copy.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -112,4 +115,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_RANGES_UNIQUE_COPY_H diff --git a/libcxx/include/__algorithm/remove.h b/libcxx/include/__algorithm/remove.h index 2b9d4ff26ed2a5..1498852c436130 100644 --- a/libcxx/include/__algorithm/remove.h +++ b/libcxx/include/__algorithm/remove.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -38,4 +41,6 @@ remove(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_REMOVE_H diff --git a/libcxx/include/__algorithm/remove_if.h b/libcxx/include/__algorithm/remove_if.h index 6eceddce8d56b4..c77b78023f529f 100644 --- a/libcxx/include/__algorithm/remove_if.h +++ b/libcxx/include/__algorithm/remove_if.h @@ -17,6 +17,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -37,4 +40,6 @@ remove_if(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_REMOVE_IF_H diff --git a/libcxx/include/__algorithm/reverse.h 
b/libcxx/include/__algorithm/reverse.h index 6bd0aa39328068..4167c9116d96e7 100644 --- a/libcxx/include/__algorithm/reverse.h +++ b/libcxx/include/__algorithm/reverse.h @@ -19,6 +19,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -54,4 +57,6 @@ reverse(_BidirectionalIterator __first, _BidirectionalIterator __last) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_REVERSE_H diff --git a/libcxx/include/__algorithm/rotate.h b/libcxx/include/__algorithm/rotate.h index d8162b1a94b272..9a4d07883e320f 100644 --- a/libcxx/include/__algorithm/rotate.h +++ b/libcxx/include/__algorithm/rotate.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -190,4 +193,6 @@ rotate(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __l _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_ROTATE_H diff --git a/libcxx/include/__algorithm/set_difference.h b/libcxx/include/__algorithm/set_difference.h index a924702ce5f26c..f414bcecb50df1 100644 --- a/libcxx/include/__algorithm/set_difference.h +++ b/libcxx/include/__algorithm/set_difference.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -71,4 +74,6 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_d _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SET_DIFFERENCE_H diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h index f2603fe1365ac3..73d888d1b03843 100644 --- a/libcxx/include/__algorithm/set_intersection.h +++ b/libcxx/include/__algorithm/set_intersection.h @@ -21,6 +21,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD 
template @@ -95,4 +98,6 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_i _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SET_INTERSECTION_H diff --git a/libcxx/include/__algorithm/set_symmetric_difference.h b/libcxx/include/__algorithm/set_symmetric_difference.h index 832c3979bfd762..db36665a61365c 100644 --- a/libcxx/include/__algorithm/set_symmetric_difference.h +++ b/libcxx/include/__algorithm/set_symmetric_difference.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -101,4 +104,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_symmetri _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SET_SYMMETRIC_DIFFERENCE_H diff --git a/libcxx/include/__algorithm/set_union.h b/libcxx/include/__algorithm/set_union.h index cf48adae03bed3..a79c50fd3cf2f0 100644 --- a/libcxx/include/__algorithm/set_union.h +++ b/libcxx/include/__algorithm/set_union.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -97,4 +100,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_union( _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SET_UNION_H diff --git a/libcxx/include/__algorithm/shift_left.h b/libcxx/include/__algorithm/shift_left.h index 645c58c2911924..06cd7c5f87644e 100644 --- a/libcxx/include/__algorithm/shift_left.h +++ b/libcxx/include/__algorithm/shift_left.h @@ -17,6 +17,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -51,4 +54,6 @@ shift_left(_ForwardIterator __first, _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SHIFT_LEFT_H diff --git a/libcxx/include/__algorithm/shift_right.h 
b/libcxx/include/__algorithm/shift_right.h index 73ef98bd39deda..01853057fc4788 100644 --- a/libcxx/include/__algorithm/shift_right.h +++ b/libcxx/include/__algorithm/shift_right.h @@ -20,6 +20,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -97,4 +100,6 @@ shift_right(_ForwardIterator __first, _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SHIFT_RIGHT_H diff --git a/libcxx/include/__algorithm/sort.h b/libcxx/include/__algorithm/sort.h index 451133a2d193dd..8a5e0211cdf4c1 100644 --- a/libcxx/include/__algorithm/sort.h +++ b/libcxx/include/__algorithm/sort.h @@ -39,6 +39,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // stable, 2-3 compares, 0-2 swaps @@ -1009,4 +1012,6 @@ sort(_RandomAccessIterator __first, _RandomAccessIterator __last) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SORT_H diff --git a/libcxx/include/__algorithm/sort_heap.h b/libcxx/include/__algorithm/sort_heap.h index 0a6d992d0090e3..060fc33c3c6e93 100644 --- a/libcxx/include/__algorithm/sort_heap.h +++ b/libcxx/include/__algorithm/sort_heap.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -55,4 +58,6 @@ sort_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SORT_HEAP_H diff --git a/libcxx/include/__algorithm/stable_partition.h b/libcxx/include/__algorithm/stable_partition.h index 8762abcf18e150..8bb1eaf2d22495 100644 --- a/libcxx/include/__algorithm/stable_partition.h +++ b/libcxx/include/__algorithm/stable_partition.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -299,4 +302,6 @@ 
stable_partition(_ForwardIterator __first, _ForwardIterator __last, _Predicate _ _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_STABLE_PARTITION_H diff --git a/libcxx/include/__algorithm/stable_sort.h b/libcxx/include/__algorithm/stable_sort.h index ffc6e4ce281888..9be192bd65a6ef 100644 --- a/libcxx/include/__algorithm/stable_sort.h +++ b/libcxx/include/__algorithm/stable_sort.h @@ -29,6 +29,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -265,4 +268,6 @@ inline _LIBCPP_HIDE_FROM_ABI void stable_sort(_RandomAccessIterator __first, _Ra _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_STABLE_SORT_H diff --git a/libcxx/include/__algorithm/swap_ranges.h b/libcxx/include/__algorithm/swap_ranges.h index 7fab5c49a656fe..54b453b72360e0 100644 --- a/libcxx/include/__algorithm/swap_ranges.h +++ b/libcxx/include/__algorithm/swap_ranges.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // 2+2 iterators: the shorter size will be used. 
@@ -54,4 +57,6 @@ swap_ranges(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardItera _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SWAP_RANGES_H diff --git a/libcxx/include/__algorithm/unique.h b/libcxx/include/__algorithm/unique.h index 1717a00c8a9346..056373d06fe44c 100644 --- a/libcxx/include/__algorithm/unique.h +++ b/libcxx/include/__algorithm/unique.h @@ -21,6 +21,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // unique @@ -56,4 +59,6 @@ unique(_ForwardIterator __first, _ForwardIterator __last) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_UNIQUE_H diff --git a/libcxx/include/__algorithm/unique_copy.h b/libcxx/include/__algorithm/unique_copy.h index 81fcd50f011d5d..16ce80cab32f0d 100644 --- a/libcxx/include/__algorithm/unique_copy.h +++ b/libcxx/include/__algorithm/unique_copy.h @@ -23,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD namespace __unique_copy_tags { @@ -119,4 +122,6 @@ unique_copy(_InputIterator __first, _InputIterator __last, _OutputIterator __res _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_UNIQUE_COPY_H diff --git a/libcxx/include/__algorithm/unwrap_iter.h b/libcxx/include/__algorithm/unwrap_iter.h index a298a2b271056c..50d815c9708849 100644 --- a/libcxx/include/__algorithm/unwrap_iter.h +++ b/libcxx/include/__algorithm/unwrap_iter.h @@ -80,6 +80,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _OrigIter __rewrap_iter(_OrigIter __orig _LIBCPP_END_NAMESPACE_STD -_LIBCPP_PUSH_MACROS +_LIBCPP_POP_MACROS #endif // _LIBCPP___ALGORITHM_UNWRAP_ITER_H diff --git a/libcxx/include/__algorithm/unwrap_range.h b/libcxx/include/__algorithm/unwrap_range.h index 053fd550b302ee..2d4b9bb5545ad3 100644 --- a/libcxx/include/__algorithm/unwrap_range.h +++ b/libcxx/include/__algorithm/unwrap_range.h @@ -22,6 +22,9 
@@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // __unwrap_range and __rewrap_range are used to unwrap ranges which may have different iterator and sentinel types. @@ -91,4 +94,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Iter __rewrap_range(_Iter __orig_iter, _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_UNWRAP_RANGE_H diff --git a/libcxx/include/__availability b/libcxx/include/__availability index c5069a027750ec..b8b2da9bb12265 100644 --- a/libcxx/include/__availability +++ b/libcxx/include/__availability @@ -72,11 +72,10 @@ # endif #endif -// Availability markup is disabled when building the library, or when the compiler +// Availability markup is disabled when building the library, or when a non-Clang +// compiler is used because only Clang supports the necessary attributes. // doesn't support the proper attributes. -#if defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCXXABI_BUILDING_LIBRARY) || \ - !__has_feature(attribute_availability_with_strict) || !__has_feature(attribute_availability_in_templates) || \ - !__has_extension(pragma_clang_attribute_external_declaration) +#if defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCXXABI_BUILDING_LIBRARY) || !defined(_LIBCPP_COMPILER_CLANG_BASED) # if !defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) # define _LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS # endif diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference index 9032b8f0180937..3a5339b72ddc31 100644 --- a/libcxx/include/__bit_reference +++ b/libcxx/include/__bit_reference @@ -173,7 +173,7 @@ private: // fill_n -template +template _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { using _It = __bit_iterator<_Cp, false>; @@ -185,7 +185,7 @@ __fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { __storage_type __clz_f = 
static_cast<__storage_type>(__bits_per_word - __first.__ctz_); __storage_type __dn = std::min(__clz_f, __n); __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - if (_FillValue) + if (_FillVal) *__first.__seg_ |= __m; else *__first.__seg_ &= ~__m; @@ -194,13 +194,13 @@ __fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { } // do middle whole words __storage_type __nw = __n / __bits_per_word; - std::fill_n(std::__to_address(__first.__seg_), __nw, _FillValue ? static_cast<__storage_type>(-1) : 0); + std::fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? static_cast<__storage_type>(-1) : 0); __n -= __nw * __bits_per_word; // do last partial word if (__n > 0) { __first.__seg_ += __nw; __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - if (_FillValue) + if (_FillVal) *__first.__seg_ |= __m; else *__first.__seg_ &= ~__m; @@ -1007,7 +1007,7 @@ private: friend class __bit_iterator<_Cp, true>; template friend struct __bit_array; - template + template _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void __fill_n(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); template diff --git a/libcxx/include/__config b/libcxx/include/__config index 9557e8e8cf97f2..8b2eaf69d17042 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -62,7 +62,7 @@ // _LIBCPP_VERSION represents the version of libc++, which matches the version of LLVM. // Given a LLVM release LLVM XX.YY.ZZ (e.g. LLVM 17.0.1 == 17.00.01), _LIBCPP_VERSION is // defined to XXYYZZ. -# define _LIBCPP_VERSION 180000 +# define _LIBCPP_VERSION 180100 # define _LIBCPP_CONCAT_IMPL(_X, _Y) _X##_Y # define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y) @@ -174,11 +174,6 @@ // The implementation moved to the header, but we still export the symbols from // the dylib for backwards compatibility. 
# define _LIBCPP_ABI_DO_NOT_EXPORT_TO_CHARS_BASE_10 -// Save memory by providing the allocator more freedom to allocate the most -// efficient size class by dropping the alignment requirements for std::string's -// pointer from 16 to 8. This changes the output of std::string::max_size, -// which makes it ABI breaking -# define _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT # elif _LIBCPP_ABI_VERSION == 1 # if !(defined(_LIBCPP_OBJECT_FORMAT_COFF) || defined(_LIBCPP_OBJECT_FORMAT_XCOFF)) // Enable compiling copies of now inline methods into the dylib to support @@ -1280,8 +1275,8 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c # endif // _LIBCPP_ENABLE_CXX20_REMOVED_FEATURES // clang-format off -# define _LIBCPP_PUSH_MACROS _Pragma("push_macro(\"min\")") _Pragma("push_macro(\"max\")") _Pragma("push_macro(\"refresh()\")") _Pragma("push_macro(\"move(int, int)\")") _Pragma("push_macro(\"erase()\")") -# define _LIBCPP_POP_MACROS _Pragma("pop_macro(\"min\")") _Pragma("pop_macro(\"max\")") _Pragma("pop_macro(\"refresh()\")") _Pragma("pop_macro(\"move(int, int)\")") _Pragma("pop_macro(\"erase()\")") +# define _LIBCPP_PUSH_MACROS _Pragma("push_macro(\"min\")") _Pragma("push_macro(\"max\")") _Pragma("push_macro(\"refresh\")") _Pragma("push_macro(\"move\")") _Pragma("push_macro(\"erase\")") +# define _LIBCPP_POP_MACROS _Pragma("pop_macro(\"min\")") _Pragma("pop_macro(\"max\")") _Pragma("pop_macro(\"refresh\")") _Pragma("pop_macro(\"move\")") _Pragma("pop_macro(\"erase\")") // clang-format on # ifndef _LIBCPP_NO_AUTO_LINK diff --git a/libcxx/include/__filesystem/directory_iterator.h b/libcxx/include/__filesystem/directory_iterator.h index 5287a4d8b055fd..a5aa5ff5432dab 100644 --- a/libcxx/include/__filesystem/directory_iterator.h +++ b/libcxx/include/__filesystem/directory_iterator.h @@ -29,6 +29,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 17 && !defined(_LIBCPP_HAS_NO_FILESYSTEM) 
_LIBCPP_BEGIN_NAMESPACE_FILESYSTEM @@ -144,4 +147,6 @@ _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY inline constexpr bool #endif // _LIBCPP_STD_VER >= 17 && !defined(_LIBCPP_HAS_NO_FILESYSTEM) +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FILESYSTEM_DIRECTORY_ITERATOR_H diff --git a/libcxx/include/__filesystem/path.h b/libcxx/include/__filesystem/path.h index 1ff992dd64e6d7..8c7d426f7a6f4f 100644 --- a/libcxx/include/__filesystem/path.h +++ b/libcxx/include/__filesystem/path.h @@ -36,6 +36,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM @@ -925,4 +928,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FILESYSTEM_PATH_H diff --git a/libcxx/include/__filesystem/recursive_directory_iterator.h b/libcxx/include/__filesystem/recursive_directory_iterator.h index 7519cc2f2932f2..a8af4f73b14a5f 100644 --- a/libcxx/include/__filesystem/recursive_directory_iterator.h +++ b/libcxx/include/__filesystem/recursive_directory_iterator.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 17 && !defined(_LIBCPP_HAS_NO_FILESYSTEM) _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM @@ -157,4 +160,6 @@ _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY inline constexpr bool #endif // _LIBCPP_STD_VER >= 17 && !defined(_LIBCPP_HAS_NO_FILESYSTEM) +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FILESYSTEM_RECURSIVE_DIRECTORY_ITERATOR_H diff --git a/libcxx/include/__format/format_arg.h b/libcxx/include/__format/format_arg.h index 10fca15d5a7a94..34ed9bcd6d63c9 100644 --- a/libcxx/include/__format/format_arg.h +++ b/libcxx/include/__format/format_arg.h @@ -30,6 +30,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -289,4 +292,6 @@ _LIBCPP_HIDE_FROM_ABI decltype(auto) visit_format_arg(_Visitor&& __vis, basic_fo 
_LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FORMAT_FORMAT_ARG_H diff --git a/libcxx/include/__format/format_context.h b/libcxx/include/__format/format_context.h index 5b252b81f691bc..edb0348b34f363 100644 --- a/libcxx/include/__format/format_context.h +++ b/libcxx/include/__format/format_context.h @@ -35,6 +35,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -205,4 +208,6 @@ _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(basic_format_context); _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FORMAT_FORMAT_CONTEXT_H diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h index 015bff70f51d97..cf833ad2055441 100644 --- a/libcxx/include/__format/format_functions.h +++ b/libcxx/include/__format/format_functions.h @@ -48,6 +48,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -674,4 +677,6 @@ formatted_size(locale __loc, wformat_string<_Args...> __fmt, _Args&&... __args) _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FORMAT_FORMAT_FUNCTIONS diff --git a/libcxx/include/__format/formatter_floating_point.h b/libcxx/include/__format/formatter_floating_point.h index 6802a8b7bd4ca3..46a090a787ae28 100644 --- a/libcxx/include/__format/formatter_floating_point.h +++ b/libcxx/include/__format/formatter_floating_point.h @@ -689,7 +689,7 @@ __format_floating_point(_Tp __value, _FormatContext& __ctx, __format_spec::__par // Let P equal the precision if nonzero, 6 if the precision is not // specified, or 1 if the precision is 0. Then, if a conversion with // style E would have an exponent of X: - int __p = std::max(1, (__specs.__has_precision() ? __specs.__precision_ : 6)); + int __p = std::max(1, (__specs.__has_precision() ? 
__specs.__precision_ : 6)); if (__result.__exponent == __result.__last) // if P > X >= -4, the conversion is with style f or F and precision P - 1 - X. // By including the radix point it calculates P - (1 + X) diff --git a/libcxx/include/__format/formatter_output.h b/libcxx/include/__format/formatter_output.h index eebe880d69ef59..d5038eb158b0ad 100644 --- a/libcxx/include/__format/formatter_output.h +++ b/libcxx/include/__format/formatter_output.h @@ -35,6 +35,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -325,4 +328,6 @@ _LIBCPP_HIDE_FROM_ABI int __truncate(basic_string_view<_CharT>& __str, int __pre _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FORMAT_FORMATTER_OUTPUT_H diff --git a/libcxx/include/__format/write_escaped.h b/libcxx/include/__format/write_escaped.h index ec1283a173e94c..43a074dd8d7002 100644 --- a/libcxx/include/__format/write_escaped.h +++ b/libcxx/include/__format/write_escaped.h @@ -30,6 +30,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD namespace __formatter { @@ -218,4 +221,6 @@ __format_escaped_string(basic_string_view<_CharT> __values, _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FORMAT_WRITE_ESCAPED_H diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h index 6505bb5871739d..416c26a0c73f2e 100644 --- a/libcxx/include/__functional/function.h +++ b/libcxx/include/__functional/function.h @@ -45,6 +45,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #ifndef _LIBCPP_CXX03_LANG _LIBCPP_BEGIN_NAMESPACE_STD @@ -1032,4 +1035,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_CXX03_LANG +_LIBCPP_POP_MACROS + #endif // _LIBCPP___FUNCTIONAL_FUNCTION_H diff --git a/libcxx/include/__iterator/cpp17_iterator_concepts.h 
b/libcxx/include/__iterator/cpp17_iterator_concepts.h index c4f49fe7422710..d1ad2b4e284808 100644 --- a/libcxx/include/__iterator/cpp17_iterator_concepts.h +++ b/libcxx/include/__iterator/cpp17_iterator_concepts.h @@ -29,6 +29,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -182,4 +185,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ITERATOR_CPP17_ITERATOR_CONCEPTS_H diff --git a/libcxx/include/__iterator/iterator_with_data.h b/libcxx/include/__iterator/iterator_with_data.h index 06c2fa699c30eb..afdc0a4e12e21c 100644 --- a/libcxx/include/__iterator/iterator_with_data.h +++ b/libcxx/include/__iterator/iterator_with_data.h @@ -24,6 +24,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 _LIBCPP_BEGIN_NAMESPACE_STD @@ -97,4 +100,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ITERATOR_ITERATOR_WITH_DATA_H diff --git a/libcxx/include/__memory/ranges_uninitialized_algorithms.h b/libcxx/include/__memory/ranges_uninitialized_algorithms.h index d836d00820a658..90090055bbbbf9 100644 --- a/libcxx/include/__memory/ranges_uninitialized_algorithms.h +++ b/libcxx/include/__memory/ranges_uninitialized_algorithms.h @@ -31,6 +31,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -317,4 +320,6 @@ inline constexpr auto uninitialized_move_n = __uninitialized_move_n::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___MEMORY_RANGES_UNINITIALIZED_ALGORITHMS_H diff --git a/libcxx/include/__memory/raw_storage_iterator.h b/libcxx/include/__memory/raw_storage_iterator.h index 33790a397c84b6..774878aa1c5e81 100644 --- a/libcxx/include/__memory/raw_storage_iterator.h +++ 
b/libcxx/include/__memory/raw_storage_iterator.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_RAW_STORAGE_ITERATOR) @@ -79,4 +82,6 @@ class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX17 raw_storage_iterator _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___MEMORY_RAW_STORAGE_ITERATOR_H diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h index 9a73d439306d9e..e6de615d76fa7d 100644 --- a/libcxx/include/__memory/shared_ptr.h +++ b/libcxx/include/__memory/shared_ptr.h @@ -61,6 +61,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // NOTE: Relaxed and acq/rel atomics (for increment and decrement respectively) @@ -1662,4 +1665,6 @@ inline _LIBCPP_HIDE_FROM_ABI bool atomic_compare_exchange_weak_explicit( _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___MEMORY_SHARED_PTR_H diff --git a/libcxx/include/__memory/uninitialized_algorithms.h b/libcxx/include/__memory/uninitialized_algorithms.h index 2a4ecf655be287..9aff93a8964863 100644 --- a/libcxx/include/__memory/uninitialized_algorithms.h +++ b/libcxx/include/__memory/uninitialized_algorithms.h @@ -42,6 +42,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD struct __always_false { @@ -648,4 +651,6 @@ __uninitialized_allocator_move_if_noexcept(_Alloc&, _Iter1 __first1, _Iter1 __la _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___MEMORY_UNINITIALIZED_ALGORITHMS_H diff --git a/libcxx/include/__mutex/once_flag.h b/libcxx/include/__mutex/once_flag.h index 5a6f8e09055f75..9d7baecbc70859 100644 --- a/libcxx/include/__mutex/once_flag.h +++ b/libcxx/include/__mutex/once_flag.h @@ -25,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include 
<__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD struct _LIBCPP_TEMPLATE_VIS once_flag; @@ -151,4 +154,6 @@ inline _LIBCPP_HIDE_FROM_ABI void call_once(once_flag& __flag, const _Callable& _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___MUTEX_ONCE_FLAG_H diff --git a/libcxx/include/__numeric/pstl_reduce.h b/libcxx/include/__numeric/pstl_reduce.h index b19972a46db7fa..f9f666c2bb38b8 100644 --- a/libcxx/include/__numeric/pstl_reduce.h +++ b/libcxx/include/__numeric/pstl_reduce.h @@ -20,6 +20,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -101,4 +104,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___NUMERIC_PSTL_REDUCE_H diff --git a/libcxx/include/__numeric/pstl_transform_reduce.h b/libcxx/include/__numeric/pstl_transform_reduce.h index 1127726046665c..2f412d41f7f27a 100644 --- a/libcxx/include/__numeric/pstl_transform_reduce.h +++ b/libcxx/include/__numeric/pstl_transform_reduce.h @@ -22,6 +22,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD @@ -148,4 +151,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 +_LIBCPP_POP_MACROS + #endif // _LIBCPP___NUMERIC_PSTL_TRANSFORM_REDUCE_H diff --git a/libcxx/include/__numeric/reduce.h b/libcxx/include/__numeric/reduce.h index 1aeefce132b2b6..6c205bf581fb95 100644 --- a/libcxx/include/__numeric/reduce.h +++ b/libcxx/include/__numeric/reduce.h @@ -19,6 +19,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 @@ -45,4 +48,6 @@ reduce(_InputIterator __first, _InputIterator __last) { 
_LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___NUMERIC_REDUCE_H diff --git a/libcxx/include/__numeric/saturation_arithmetic.h b/libcxx/include/__numeric/saturation_arithmetic.h index 50274c6bbd9f3a..0e6f455cf22825 100644 --- a/libcxx/include/__numeric/saturation_arithmetic.h +++ b/libcxx/include/__numeric/saturation_arithmetic.h @@ -19,6 +19,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 26 @@ -107,4 +110,6 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Rp saturate_cast(_Tp __x) noexcept { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___NUMERIC_SATURATION_ARITHMETIC_H diff --git a/libcxx/include/__numeric/transform_reduce.h b/libcxx/include/__numeric/transform_reduce.h index 6c0a81e5e4b099..f1150510f0c36f 100644 --- a/libcxx/include/__numeric/transform_reduce.h +++ b/libcxx/include/__numeric/transform_reduce.h @@ -18,6 +18,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 @@ -51,4 +54,6 @@ transform_reduce(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterat _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___NUMERIC_TRANSFORM_REDUCE_H diff --git a/libcxx/include/__ranges/counted.h b/libcxx/include/__ranges/counted.h index 337634895766ba..83d76f8fd21068 100644 --- a/libcxx/include/__ranges/counted.h +++ b/libcxx/include/__ranges/counted.h @@ -29,6 +29,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -82,4 +85,6 @@ inline constexpr auto counted = __counted::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_COUNTED_H diff --git a/libcxx/include/__ranges/drop_while_view.h b/libcxx/include/__ranges/drop_while_view.h index 4e3ef61678f4d7..92f48bd0ecfba3 100644 --- 
a/libcxx/include/__ranges/drop_while_view.h +++ b/libcxx/include/__ranges/drop_while_view.h @@ -37,6 +37,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -128,4 +131,6 @@ inline constexpr auto drop_while = __drop_while::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_DROP_WHILE_VIEW_H diff --git a/libcxx/include/__ranges/elements_view.h b/libcxx/include/__ranges/elements_view.h index 325e4c9dca6399..989d36fbcaaae5 100644 --- a/libcxx/include/__ranges/elements_view.h +++ b/libcxx/include/__ranges/elements_view.h @@ -43,6 +43,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -410,4 +413,6 @@ inline constexpr auto values = elements<1>; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_ELEMENTS_VIEW_H diff --git a/libcxx/include/__ranges/filter_view.h b/libcxx/include/__ranges/filter_view.h index 6e6719c14470da..5b938dd4c16e19 100644 --- a/libcxx/include/__ranges/filter_view.h +++ b/libcxx/include/__ranges/filter_view.h @@ -44,6 +44,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -252,4 +255,6 @@ inline constexpr auto filter = __filter::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_FILTER_VIEW_H diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h index c6c9618cfe6c1d..c8314dd848b447 100644 --- a/libcxx/include/__ranges/iota_view.h +++ b/libcxx/include/__ranges/iota_view.h @@ -41,6 +41,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -395,4 +398,6 @@ inline constexpr auto iota = __iota::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // 
_LIBCPP___RANGES_IOTA_VIEW_H diff --git a/libcxx/include/__ranges/join_view.h b/libcxx/include/__ranges/join_view.h index 22473059133f70..9c2c77995539bd 100644 --- a/libcxx/include/__ranges/join_view.h +++ b/libcxx/include/__ranges/join_view.h @@ -41,6 +41,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -415,4 +418,6 @@ struct __segmented_iterator_traits<_JoinViewIterator> { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_JOIN_VIEW_H diff --git a/libcxx/include/__ranges/lazy_split_view.h b/libcxx/include/__ranges/lazy_split_view.h index e96398b14b58aa..6aedfdabffe3a8 100644 --- a/libcxx/include/__ranges/lazy_split_view.h +++ b/libcxx/include/__ranges/lazy_split_view.h @@ -47,6 +47,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -433,4 +436,6 @@ inline constexpr auto lazy_split = __lazy_split_view::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_LAZY_SPLIT_VIEW_H diff --git a/libcxx/include/__ranges/repeat_view.h b/libcxx/include/__ranges/repeat_view.h index d9759abe1cba6b..d08f0e0d4e9f74 100644 --- a/libcxx/include/__ranges/repeat_view.h +++ b/libcxx/include/__ranges/repeat_view.h @@ -34,6 +34,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 23 @@ -257,4 +260,6 @@ inline constexpr bool __is_repeat_specialization> = tru _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_REPEAT_VIEW_H diff --git a/libcxx/include/__ranges/reverse_view.h b/libcxx/include/__ranges/reverse_view.h index f7846259810c92..ddbe8908414f9b 100644 --- a/libcxx/include/__ranges/reverse_view.h +++ b/libcxx/include/__ranges/reverse_view.h @@ -33,6 +33,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include 
<__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -196,4 +199,6 @@ inline constexpr auto reverse = __reverse::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_REVERSE_VIEW_H diff --git a/libcxx/include/__ranges/single_view.h b/libcxx/include/__ranges/single_view.h index ead597a9be170d..f91c7c35263676 100644 --- a/libcxx/include/__ranges/single_view.h +++ b/libcxx/include/__ranges/single_view.h @@ -26,6 +26,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -101,4 +104,6 @@ inline constexpr auto single = __single_view::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_SINGLE_VIEW_H diff --git a/libcxx/include/__ranges/split_view.h b/libcxx/include/__ranges/split_view.h index 7f03be3c346a42..98f17be04f628f 100644 --- a/libcxx/include/__ranges/split_view.h +++ b/libcxx/include/__ranges/split_view.h @@ -36,6 +36,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -224,4 +227,6 @@ inline constexpr auto split = __split_view::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_SPLIT_VIEW_H diff --git a/libcxx/include/__ranges/take_while_view.h b/libcxx/include/__ranges/take_while_view.h index 46cfe4f70ac834..77ea9f7bb81316 100644 --- a/libcxx/include/__ranges/take_while_view.h +++ b/libcxx/include/__ranges/take_while_view.h @@ -35,6 +35,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -162,4 +165,6 @@ inline constexpr auto take_while = __take_while::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_TAKE_WHILE_VIEW_H diff --git a/libcxx/include/__ranges/transform_view.h b/libcxx/include/__ranges/transform_view.h index 
3c8d825789cbc9..dc3aaa59ed8c3f 100644 --- a/libcxx/include/__ranges/transform_view.h +++ b/libcxx/include/__ranges/transform_view.h @@ -47,6 +47,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 @@ -416,4 +419,6 @@ inline constexpr auto transform = __transform::__fn{}; _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___RANGES_TRANSFORM_VIEW_H diff --git a/libcxx/include/__thread/jthread.h b/libcxx/include/__thread/jthread.h index fc86b13afb1343..2fbc8a36755e96 100644 --- a/libcxx/include/__thread/jthread.h +++ b/libcxx/include/__thread/jthread.h @@ -28,6 +28,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 20 && !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_STOP_TOKEN) _LIBCPP_BEGIN_NAMESPACE_STD @@ -127,4 +130,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 && !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_STOP_TOKEN) +_LIBCPP_POP_MACROS + #endif // _LIBCPP___THREAD_JTHREAD_H diff --git a/libcxx/include/__thread/thread.h b/libcxx/include/__thread/thread.h index 463bbd6772552c..0ecaac1b011bee 100644 --- a/libcxx/include/__thread/thread.h +++ b/libcxx/include/__thread/thread.h @@ -32,6 +32,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -251,4 +254,6 @@ inline _LIBCPP_HIDE_FROM_ABI void swap(thread& __x, thread& __y) _NOEXCEPT { __x _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___THREAD_THREAD_H diff --git a/libcxx/include/array b/libcxx/include/array index dcb419f536dc50..41f016a4859a32 100644 --- a/libcxx/include/array +++ b/libcxx/include/array @@ -159,6 +159,9 @@ template const T&& get(const array&&) noexce # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -493,6 +496,8 @@ to_array(_Tp (&&__arr)[_Size]) 
noexcept(is_nothrow_move_constructible_v<_Tp>) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/condition_variable b/libcxx/include/condition_variable index e375c986e7f12e..6aac3c13ef4a74 100644 --- a/libcxx/include/condition_variable +++ b/libcxx/include/condition_variable @@ -139,6 +139,9 @@ public: # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #ifndef _LIBCPP_HAS_NO_THREADS _LIBCPP_BEGIN_NAMESPACE_STD @@ -348,6 +351,8 @@ _LIBCPP_END_NAMESPACE_STD #endif // !_LIBCPP_HAS_NO_THREADS +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/csetjmp b/libcxx/include/csetjmp index d219c8e6cb2250..9012cad22ebe74 100644 --- a/libcxx/include/csetjmp +++ b/libcxx/include/csetjmp @@ -33,7 +33,13 @@ void longjmp(jmp_buf env, int val); #include <__assert> // all public C++ headers provide the assertion handler #include <__config> -#include +// is not provided by libc++ +#if __has_include() +# include +# ifdef _LIBCPP_SETJMP_H +# error "If libc++ starts defining , the __has_include check should move to libc++'s " +# endif +#endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/experimental/iterator b/libcxx/include/experimental/iterator index 5bb1dd1ada6380..e9c1fb6924eced 100644 --- a/libcxx/include/experimental/iterator +++ b/libcxx/include/experimental/iterator @@ -64,6 +64,9 @@ namespace std { # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #if _LIBCPP_STD_VER >= 14 _LIBCPP_BEGIN_NAMESPACE_LFTS @@ -115,6 +118,8 @@ _LIBCPP_END_NAMESPACE_LFTS #endif // _LIBCPP_STD_VER >= 14 +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/future 
b/libcxx/include/future index 5602ae41c14235..4eeb401c9bbcda 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -401,6 +401,9 @@ template struct uses_allocator, Alloc>; # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // enum class future_errc @@ -2044,6 +2047,8 @@ inline shared_future future::share() _NOEXCEPT { return shared_futur _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 # include #endif diff --git a/libcxx/include/ios b/libcxx/include/ios index d36f5fb2ca2842..8465860d08dc14 100644 --- a/libcxx/include/ios +++ b/libcxx/include/ios @@ -242,6 +242,9 @@ storage-class-specifier const error_category& iostream_category() noexcept; # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD typedef ptrdiff_t streamsize; @@ -820,6 +823,8 @@ _LIBCPP_HIDE_FROM_ABI inline ios_base& defaultfloat(ios_base& __str) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/map b/libcxx/include/map index f122f2ebb15b52..2edbc0cf6245fb 100644 --- a/libcxx/include/map +++ b/libcxx/include/map @@ -617,6 +617,9 @@ erase_if(multimap& c, Predicate pred); // C++20 # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template # include diff --git a/libcxx/include/ostream b/libcxx/include/ostream index e2b2c0cbaaf254..180adda201d830 100644 --- a/libcxx/include/ostream +++ b/libcxx/include/ostream @@ -199,6 +199,9 @@ void vprint_nonunicode(ostream& os, string_view fmt, format_args args); # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -1169,6 +1172,8 @@ println(ostream& __os, format_string<_Args...> __fmt, _Args&&... 
__args) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/print b/libcxx/include/print index 7f2b5bac3dcf61..543a540ee4f27d 100644 --- a/libcxx/include/print +++ b/libcxx/include/print @@ -32,6 +32,7 @@ namespace std { */ #include <__assert> // all public C++ headers provide the assertion handler +#include <__availability> #include <__concepts/same_as.h> #include <__config> #include <__system_error/system_error.h> @@ -43,10 +44,6 @@ namespace std { #include #include -#if __has_include() -# include -#endif - #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif @@ -68,7 +65,8 @@ _LIBCPP_EXPORTED_FROM_ABI bool __is_windows_terminal(FILE* __stream); // Note the function is only implemented on the Windows platform. _LIBCPP_EXPORTED_FROM_ABI void __write_to_windows_console(FILE* __stream, wstring_view __view); # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS - +#elif __has_include() +_LIBCPP_EXPORTED_FROM_ABI bool __is_posix_terminal(FILE* __stream); #endif // _LIBCPP_WIN32API #if _LIBCPP_STD_VER >= 23 @@ -195,15 +193,17 @@ inline constexpr bool __use_unicode_execution_charset = _MSVC_EXECUTION_CHARACTE inline constexpr bool __use_unicode_execution_charset = true; # endif -_LIBCPP_HIDE_FROM_ABI inline bool __is_terminal(FILE* __stream) { +_LIBCPP_HIDE_FROM_ABI inline bool __is_terminal([[maybe_unused]] FILE* __stream) { // The macro _LIBCPP_TESTING_PRINT_IS_TERMINAL is used to change // the behavior in the test. This is not part of the public API. 
# ifdef _LIBCPP_TESTING_PRINT_IS_TERMINAL return _LIBCPP_TESTING_PRINT_IS_TERMINAL(__stream); +# elif _LIBCPP_AVAILABILITY_HAS_PRINT == 0 + return false; # elif defined(_LIBCPP_WIN32API) return std::__is_windows_terminal(__stream); # elif __has_include() - return isatty(fileno(__stream)); + return std::__is_posix_terminal(__stream); # else # error "Provide a way to determine whether a FILE* is a terminal" # endif diff --git a/libcxx/include/queue b/libcxx/include/queue index 692e38bb35229f..76ef85945662c4 100644 --- a/libcxx/include/queue +++ b/libcxx/include/queue @@ -283,6 +283,9 @@ template # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template > @@ -971,6 +974,8 @@ struct _LIBCPP_TEMPLATE_VIS uses_allocator # include diff --git a/libcxx/include/set b/libcxx/include/set index 55ba8f8208be1b..7f8245f8b605ab 100644 --- a/libcxx/include/set +++ b/libcxx/include/set @@ -552,6 +552,9 @@ erase_if(multiset& c, Predicate pred); // C++20 # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -1488,6 +1491,8 @@ using multiset _LIBCPP_AVAILABILITY_PMR = std::multiset<_KeyT, _CompareT, polymo _LIBCPP_END_NAMESPACE_STD #endif +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/stack b/libcxx/include/stack index 546380b0aacd07..f1f6ee8482fd21 100644 --- a/libcxx/include/stack +++ b/libcxx/include/stack @@ -138,6 +138,9 @@ template # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template > @@ -366,6 +369,8 @@ struct _LIBCPP_TEMPLATE_VIS uses_allocator, _Alloc> : pub _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/stddef.h b/libcxx/include/stddef.h index 
887776b150e49d..1583e78e3739ba 100644 --- a/libcxx/include/stddef.h +++ b/libcxx/include/stddef.h @@ -7,18 +7,6 @@ // //===----------------------------------------------------------------------===// -#if defined(__need_ptrdiff_t) || defined(__need_size_t) || defined(__need_wchar_t) || defined(__need_NULL) || \ - defined(__need_wint_t) - -# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -# endif - -# include_next - -#elif !defined(_LIBCPP_STDDEF_H) -# define _LIBCPP_STDDEF_H - /* stddef.h synopsis @@ -36,15 +24,18 @@ */ -# include <__config> +#include <__config> -# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -# endif +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif -# if __has_include_next() -# include_next -# endif +// Note: This include is outside of header guards because we sometimes get included multiple times +// with different defines and the underlying <stddef.h> will know how to deal with that.
+#include_next + +#ifndef _LIBCPP_STDDEF_H +# define _LIBCPP_STDDEF_H # ifdef __cplusplus typedef decltype(nullptr) nullptr_t; diff --git a/libcxx/include/string b/libcxx/include/string index e97139206d4fa7..ba169c3dbfc9e6 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -1937,12 +1937,7 @@ private: return (__s + (__a - 1)) & ~(__a - 1); } enum { - __alignment = -#ifdef _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT - 8 -#else - 16 -#endif + __alignment = 8 }; static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type __recommend(size_type __s) _NOEXCEPT { if (__s < __min_cap) { diff --git a/libcxx/include/strstream b/libcxx/include/strstream index 7843184e4da4f8..e20c86baa6dfc5 100644 --- a/libcxx/include/strstream +++ b/libcxx/include/strstream @@ -139,6 +139,9 @@ private: # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD class _LIBCPP_DEPRECATED _LIBCPP_EXPORTED_FROM_ABI strstreambuf : public streambuf { @@ -340,4 +343,6 @@ private: _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP_STRSTREAM diff --git a/libcxx/include/unordered_map b/libcxx/include/unordered_map index 4be25fc1cdd8fe..2c1782dc879e65 100644 --- a/libcxx/include/unordered_map +++ b/libcxx/include/unordered_map @@ -625,6 +625,9 @@ template # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template # include diff --git a/libcxx/include/unordered_set b/libcxx/include/unordered_set index 6414885f4c514a..50b616907f0052 100644 --- a/libcxx/include/unordered_set +++ b/libcxx/include/unordered_set @@ -570,6 +570,9 @@ template # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template @@ -1810,6 +1813,8 @@ using unordered_multiset _LIBCPP_AVAILABILITY_PMR = _LIBCPP_END_NAMESPACE_STD #endif +_LIBCPP_POP_MACROS + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # 
include # include diff --git a/libcxx/include/version b/libcxx/include/version index 9e26da8c1b2425..d356976d6454ad 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -266,7 +266,9 @@ __cpp_lib_within_lifetime 202306L # define __cpp_lib_make_reverse_iterator 201402L # define __cpp_lib_make_unique 201304L # define __cpp_lib_null_iterators 201304L -# define __cpp_lib_quoted_string_io 201304L +# if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# define __cpp_lib_quoted_string_io 201304L +# endif # define __cpp_lib_result_of_sfinae 201210L # define __cpp_lib_robust_nonmodifying_seq_ops 201304L # if !defined(_LIBCPP_HAS_NO_THREADS) @@ -294,7 +296,7 @@ __cpp_lib_within_lifetime 202306L # define __cpp_lib_clamp 201603L # define __cpp_lib_enable_shared_from_this 201603L // # define __cpp_lib_execution 201603L -# if _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY # define __cpp_lib_filesystem 201703L # endif # define __cpp_lib_gcd_lcm 201606L @@ -323,7 +325,9 @@ __cpp_lib_within_lifetime 202306L // # define __cpp_lib_parallel_algorithm 201603L # define __cpp_lib_raw_memory_algorithms 201606L # define __cpp_lib_sample 201603L -# define __cpp_lib_scoped_lock 201703L +# if !defined(_LIBCPP_HAS_NO_THREADS) +# define __cpp_lib_scoped_lock 201703L +# endif # if !defined(_LIBCPP_HAS_NO_THREADS) # define __cpp_lib_shared_mutex 201505L # endif @@ -496,7 +500,9 @@ __cpp_lib_within_lifetime 202306L // # define __cpp_lib_freestanding_optional 202311L // # define __cpp_lib_freestanding_string_view 202311L // # define __cpp_lib_freestanding_variant 202311L -# define __cpp_lib_fstream_native_handle 202306L +# if !defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# define __cpp_lib_fstream_native_handle 202306L +# endif // # define __cpp_lib_function_ref 202306L // # define __cpp_lib_hazard_pointer 202306L // # define __cpp_lib_linalg 202311L diff --git 
a/libcxx/lib/abi/CHANGELOG.TXT b/libcxx/lib/abi/CHANGELOG.TXT index 1179c253f18c8f..7ff604959f4d5c 100644 --- a/libcxx/lib/abi/CHANGELOG.TXT +++ b/libcxx/lib/abi/CHANGELOG.TXT @@ -16,6 +16,14 @@ New entries should be added directly below the "Version" header. Version 18.0 ------------ +* [libc++] Moves is_terminal to the dylib + + The patch moves the POSIX implementation of is_terminal to the dylib. This is + needed to avoid using <unistd.h> in public headers. + + All platforms + Symbol added: _ZNSt6__ndk119__is_posix_terminalEP7__sFILE + * [libc++abi] Implement __cxa_init_primary_exception and use it to optimize std::make_exception_ptr (#65534) This patch implements __cxa_init_primary_exception, an extension to the Itanium C++ ABI. diff --git a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index c2fea4d8adb420..2064f45bf8c084 100644 --- a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1495,6 +1495,7 @@ {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index a60f099b532052..fec3a4505a0c6d 100644
--- a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1176,6 +1176,7 @@ {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt6__ndk119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index a159ff52218667..e52cf98dd4c4f1 100644 --- a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -534,6 +534,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP4FILE', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 
'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index 5749a7520f9bac..52a04706ddf20b 100644 --- a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -534,6 +534,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP4FILE', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index e827114f169197..bced6b2ea81ba5 100644 --- a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ 
b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1495,6 +1495,7 @@ {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index f4077adc074e0a..efa2189e9c9287 100644 --- a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1176,6 +1176,7 @@ {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt6__ndk119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist 
index e3d3fcb35d8403..ebda5b0dfba57d 100644 --- a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1190,6 +1190,7 @@ {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist index 16923301d2548e..6432ad3be35859 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1188,6 +1188,7 @@ {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP8_IO_FILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git 
a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist index 2380ffb100de97..1fe84e17b3f7f0 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist @@ -1159,6 +1159,7 @@ {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP8_IO_FILE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'} diff --git a/libcxx/modules/CMakeLists.txt b/libcxx/modules/CMakeLists.txt index 0388c048dacb8b..d47d19a4755317 100644 --- a/libcxx/modules/CMakeLists.txt +++ b/libcxx/modules/CMakeLists.txt @@ -137,6 +137,25 @@ set(LIBCXX_MODULE_STD_COMPAT_SOURCES std.compat/cwctype.inc ) +# TODO MODULES the CMakeLists.txt in the build directory is only temporary. +# This allows using as available in the build directory. Once build systems +# have proper support for the installed files this will be removed. +if ("${LIBCXX_GENERATED_INCLUDE_DIR}" STREQUAL "${LIBCXX_GENERATED_INCLUDE_TARGET_DIR}") + # This typically happens when the target is not installed. + set(LIBCXX_CONFIGURED_INCLUDE_DIRS "${LIBCXX_GENERATED_INCLUDE_DIR}") +else() + # It's important that the arch directory be included first so that its header files + # which interpose on the default include dir be included instead of the default ones. 
+ set(LIBCXX_CONFIGURED_INCLUDE_DIRS + "${LIBCXX_GENERATED_INCLUDE_TARGET_DIR};${LIBCXX_GENERATED_INCLUDE_DIR}" + ) +endif() +configure_file( + "CMakeLists.txt.in" + "${LIBCXX_GENERATED_MODULE_DIR}/CMakeLists.txt" + @ONLY +) + set(LIBCXX_MODULE_STD_INCLUDE_SOURCES) foreach(file ${LIBCXX_MODULE_STD_SOURCES}) set( @@ -166,6 +185,7 @@ configure_file( ) set(_all_modules) +list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/CMakeLists.txt") list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/std.cppm") list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/std.compat.cppm") foreach(file ${LIBCXX_MODULE_STD_SOURCES} ${LIBCXX_MODULE_STD_COMPAT_SOURCES}) @@ -186,9 +206,20 @@ add_custom_target(generate-cxx-modules # Configure the modules manifest. # Use the relative path between the installation and the module in the json # file. This allows moving the entire installation to a different location. +if("${CMAKE_INSTALL_PREFIX}" STREQUAL "") + set(BASE_DIRECTORY "/") +else() + set(BASE_DIRECTORY ${CMAKE_INSTALL_PREFIX}) +endif() +cmake_path(ABSOLUTE_PATH LIBCXX_INSTALL_LIBRARY_DIR + BASE_DIRECTORY ${BASE_DIRECTORY} + OUTPUT_VARIABLE ABS_LIBRARY_DIR) +cmake_path(ABSOLUTE_PATH LIBCXX_INSTALL_MODULES_DIR + BASE_DIRECTORY ${BASE_DIRECTORY} + OUTPUT_VARIABLE ABS_MODULES_DIR) file(RELATIVE_PATH LIBCXX_MODULE_RELATIVE_PATH - ${CMAKE_INSTALL_PREFIX}/${LIBCXX_INSTALL_LIBRARY_DIR} - ${CMAKE_INSTALL_PREFIX}/${LIBCXX_INSTALL_MODULES_DIR}) + ${ABS_LIBRARY_DIR} + ${ABS_MODULES_DIR}) configure_file( "modules.json.in" "${LIBCXX_LIBRARY_DIR}/libc++.modules.json" diff --git a/libcxx/modules/CMakeLists.txt.in b/libcxx/modules/CMakeLists.txt.in new file mode 100644 index 00000000000000..e332d70cc16333 --- /dev/null +++ b/libcxx/modules/CMakeLists.txt.in @@ -0,0 +1,88 @@ +cmake_minimum_required(VERSION 3.26) + +project(libc++-modules LANGUAGES CXX) + +# Enable CMake's module support +if(CMAKE_VERSION VERSION_LESS "3.28.0") + if(CMAKE_VERSION VERSION_LESS "3.27.0") + 
set(CMAKE_EXPERIMENTAL_CXX_MODULE_CMAKE_API "2182bf5c-ef0d-489a-91da-49dbc3090d2a") + else() + set(CMAKE_EXPERIMENTAL_CXX_MODULE_CMAKE_API "aa1f7df0-828a-4fcd-9afc-2dc80491aca7") + endif() + set(CMAKE_EXPERIMENTAL_CXX_MODULE_DYNDEP 1) +else() + cmake_policy(VERSION 3.28) +endif() + +# Default to C++ extensions being off. Libc++'s modules support have trouble +# with extensions right now. +set(CMAKE_CXX_EXTENSIONS OFF) + +# Propagates the CMake options to the modules. +# +# This uses the std module hard-coded since the std.compat module does not +# depend on these flags. +macro(compile_define_if_not condition def) + if (NOT ${condition}) + target_compile_definitions(std PRIVATE ${def}) + endif() +endmacro() +macro(compile_define_if condition def) + if (${condition}) + target_compile_definitions(std PRIVATE ${def}) + endif() +endmacro() + +### STD + +add_library(std) +target_sources(std + PUBLIC FILE_SET cxx_modules TYPE CXX_MODULES FILES + std.cppm +) + +target_include_directories(std SYSTEM PRIVATE @LIBCXX_CONFIGURED_INCLUDE_DIRS@) + +if (NOT @LIBCXX_ENABLE_EXCEPTIONS@) + target_compile_options(std PUBLIC -fno-exceptions) +endif() + +target_compile_options(std + PUBLIC + -nostdinc++ + -Wno-reserved-module-identifier + -Wno-reserved-user-defined-literal + @LIBCXX_COMPILE_FLAGS@ +) +set_target_properties(std + PROPERTIES + OUTPUT_NAME "c++std" +) + +### STD.COMPAT + +add_library(std.compat) +target_sources(std.compat + PUBLIC FILE_SET cxx_modules TYPE CXX_MODULES FILES + std.compat.cppm +) + +target_include_directories(std.compat SYSTEM PRIVATE @LIBCXX_CONFIGURED_INCLUDE_DIRS@) + +if (NOT @LIBCXX_ENABLE_EXCEPTIONS@) + target_compile_options(std.compat PUBLIC -fno-exceptions) +endif() + +target_compile_options(std.compat + PUBLIC + -nostdinc++ + -Wno-reserved-module-identifier + -Wno-reserved-user-defined-literal + -fmodule-file=std=${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/std.dir/std.pcm + @LIBCXX_COMPILE_FLAGS@ +) +set_target_properties(std.compat + PROPERTIES + 
OUTPUT_NAME "c++std.compat" +) +add_dependencies(std.compat std) diff --git a/libcxx/modules/modules.json.in b/libcxx/modules/modules.json.in index ddc377f28f9194..759ac92d81f18e 100644 --- a/libcxx/modules/modules.json.in +++ b/libcxx/modules/modules.json.in @@ -5,7 +5,7 @@ { "logical-name": "std", "source-path": "@LIBCXX_MODULE_RELATIVE_PATH@/std.cppm", - "is-standard-library": true, + "is-std-library": true, "local-arguments": { "system-include-directories": [ "@LIBCXX_MODULE_RELATIVE_PATH@" diff --git a/libcxx/modules/std.compat/cstdlib.inc b/libcxx/modules/std.compat/cstdlib.inc index a45a0a1caf8ba9..4783cbf5162390 100644 --- a/libcxx/modules/std.compat/cstdlib.inc +++ b/libcxx/modules/std.compat/cstdlib.inc @@ -25,7 +25,7 @@ export { using ::system; // [c.malloc], C library memory allocation - using ::aligned_alloc; + using ::aligned_alloc _LIBCPP_USING_IF_EXISTS; using ::calloc; using ::free; using ::malloc; diff --git a/libcxx/modules/std/atomic.inc b/libcxx/modules/std/atomic.inc index 5139b7531093d7..88b31ccdb20840 100644 --- a/libcxx/modules/std/atomic.inc +++ b/libcxx/modules/std/atomic.inc @@ -60,7 +60,9 @@ export namespace std { using std::atomic_char; using std::atomic_char16_t; using std::atomic_char32_t; +#ifndef _LIBCPP_HAS_NO_CHAR8_T using std::atomic_char8_t; +#endif using std::atomic_int; using std::atomic_llong; using std::atomic_long; diff --git a/libcxx/modules/std/iosfwd.inc b/libcxx/modules/std/iosfwd.inc index ec8b434ca0c51b..410fb6aefed801 100644 --- a/libcxx/modules/std/iosfwd.inc +++ b/libcxx/modules/std/iosfwd.inc @@ -14,7 +14,9 @@ export namespace std { #endif using std::u16streampos; using std::u32streampos; +#ifndef _LIBCPP_HAS_NO_CHAR8_T using std::u8streampos; +#endif using std::basic_osyncstream; using std::basic_syncbuf; diff --git a/libcxx/modules/std/ostream.inc b/libcxx/modules/std/ostream.inc index 8fcbfb4bdc1828..0e0e2d54fe6bae 100644 --- a/libcxx/modules/std/ostream.inc +++ b/libcxx/modules/std/ostream.inc @@ -33,8 +33,10 
@@ export namespace std { using std::println; using std::vprint_nonunicode; +# ifndef _LIBCPP_HAS_NO_UNICODE using std::vprint_unicode; -# endif // _LIBCPP_STD_VER >= 23 +# endif // _LIBCPP_HAS_NO_UNICODE +# endif // _LIBCPP_STD_VER >= 23 #endif // _LIBCPP_HAS_NO_LOCALIZATION } // namespace std diff --git a/libcxx/modules/std/string.inc b/libcxx/modules/std/string.inc index c83ee7643f87e9..9808a96215a182 100644 --- a/libcxx/modules/std/string.inc +++ b/libcxx/modules/std/string.inc @@ -34,7 +34,9 @@ export namespace std { using std::string; using std::u16string; using std::u32string; +#ifndef _LIBCPP_HAS_NO_CHAR8_T using std::u8string; +#endif #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS using std::wstring; #endif @@ -58,7 +60,9 @@ export namespace std { using std::pmr::string; using std::pmr::u16string; using std::pmr::u32string; +#ifndef _LIBCPP_HAS_NO_CHAR8_T using std::pmr::u8string; +#endif #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS using std::pmr::wstring; #endif diff --git a/libcxx/modules/std/string_view.inc b/libcxx/modules/std/string_view.inc index 1fa63a77395358..f4f9d80ddb83da 100644 --- a/libcxx/modules/std/string_view.inc +++ b/libcxx/modules/std/string_view.inc @@ -27,7 +27,9 @@ export namespace std { using std::string_view; using std::u16string_view; using std::u32string_view; +#ifndef _LIBCPP_HAS_NO_CHAR8_T using std::u8string_view; +#endif #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS using std::wstring_view; #endif diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index 44a088663463c9..1b80625304a412 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -306,7 +306,10 @@ if (LIBCXX_ENABLE_STATIC) # then its code shouldn't declare them with hidden visibility. They might # actually be provided by a shared library at link time. 
if (LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS) - append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete-hidden) + append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete=force-hidden) + if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG) + append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete-hidden) + endif() endif() target_compile_options(cxx_static PRIVATE ${CXX_STATIC_LIBRARY_FLAGS}) # _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS can be defined in __config_site diff --git a/libcxx/src/print.cpp b/libcxx/src/print.cpp index 3692187a5954a3..8fa59fdd097bcd 100644 --- a/libcxx/src/print.cpp +++ b/libcxx/src/print.cpp @@ -8,22 +8,26 @@ #include <__config> -#if defined(_LIBCPP_WIN32API) +#include +#include + +#include <__system_error/system_error.h> -# include -# include +#include "filesystem/error.h" +#if defined(_LIBCPP_WIN32API) # define WIN32_LEAN_AND_MEAN # define NOMINMAX # include # include - -# include <__system_error/system_error.h> - -# include "filesystem/error.h" +#elif __has_include() +# include +#endif _LIBCPP_BEGIN_NAMESPACE_STD +#if defined(_LIBCPP_WIN32API) + _LIBCPP_EXPORTED_FROM_ABI bool __is_windows_terminal(FILE* __stream) { // Note the Standard does this in one call, but it's unclear whether // an invalid handle is allowed when calling GetConsoleMode. 
@@ -52,6 +56,9 @@ __write_to_windows_console([[maybe_unused]] FILE* __stream, [[maybe_unused]] wst } # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS -_LIBCPP_END_NAMESPACE_STD +#elif __has_include() // !_LIBCPP_WIN32API -#endif // !_LIBCPP_WIN32API +_LIBCPP_EXPORTED_FROM_ABI bool __is_posix_terminal(FILE* __stream) { return isatty(fileno(__stream)); } +#endif + +_LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp index c7df56c815a805..1110e3d3ec568a 100644 --- a/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp +++ b/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx{{10.13|10.15|11.0}} + // // This test demonstrates the smaller allocation sizes when the alignment @@ -17,14 +19,8 @@ #include "test_macros.h" -// alignment of the string heap buffer is hardcoded to either 16 or 8 - -const std::size_t alignment = -#ifdef _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT - 8; -#else - 16; -#endif +// alignment of the string heap buffer is hardcoded to 8 +const std::size_t alignment = 8; int main(int, char**) { std::string input_string; diff --git a/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp index a3cb79522f2e1e..726570beb6d1ae 100644 --- a/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp +++ b/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp @@ -17,14 +17,8 @@ #include "test_macros.h" -// alignment of the string heap buffer is hardcoded to 16 - -static const std::size_t alignment = -#ifdef _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT - 8; -#else - 16; -#endif +// alignment of 
the string heap buffer is hardcoded to 8 +static const std::size_t alignment = 8; template TEST_CONSTEXPR_CXX20 void full_size() { diff --git a/libcxx/test/libcxx/system_reserved_names.gen.py b/libcxx/test/libcxx/system_reserved_names.gen.py index 8ddd035ff2d258..5b75dba544ef20 100644 --- a/libcxx/test/libcxx/system_reserved_names.gen.py +++ b/libcxx/test/libcxx/system_reserved_names.gen.py @@ -7,7 +7,8 @@ #===----------------------------------------------------------------------===## # Test that headers are not tripped up by the surrounding code defining various -# alphabetic macros. +# alphabetic macros. Also ensure that we don't swallow the definition of user +# provided macros (in other words, ensure that we push/pop correctly everywhere). # RUN: %{python} %s %{libcxx}/utils @@ -162,4 +163,13 @@ #define refresh SYSTEM_RESERVED_NAME #include <{header}> + +// Make sure we don't swallow the definition of the macros we push/pop +#define STRINGIFY_IMPL(x) #x +#define STRINGIFY(x) STRINGIFY_IMPL(x) +static_assert(__builtin_strcmp(STRINGIFY(min), STRINGIFY(SYSTEM_RESERVED_NAME)) == 0, ""); +static_assert(__builtin_strcmp(STRINGIFY(max), STRINGIFY(SYSTEM_RESERVED_NAME)) == 0, ""); +static_assert(__builtin_strcmp(STRINGIFY(move), STRINGIFY(SYSTEM_RESERVED_NAME)) == 0, ""); +static_assert(__builtin_strcmp(STRINGIFY(erase), STRINGIFY(SYSTEM_RESERVED_NAME)) == 0, ""); +static_assert(__builtin_strcmp(STRINGIFY(refresh), STRINGIFY(SYSTEM_RESERVED_NAME)) == 0, ""); """) diff --git a/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp b/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp new file mode 100644 index 00000000000000..c55a0a4d6e5d1b --- /dev/null +++ b/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: stdlib=apple-libc++ + +// Test that using -pedantic-errors doesn't turn off availability annotations. +// This used to be the case because we used __has_extension(...) to enable the +// availability annotations, and -pedantic-errors changes the behavior of +// __has_extension(...) in an incompatible way. + +// ADDITIONAL_COMPILE_FLAGS: -pedantic-errors + +#include <__availability> + +#if defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) +# error Availability annotations should be enabled on Apple platforms in the system configuration! +#endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp index 46ccde800c1796..3f03e8be9aeab3 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp @@ -51,7 +51,7 @@ # error "__cpp_lib_char8_t should not be defined before c++20" # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++17" # endif @@ -60,7 +60,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" 
+# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" # endif # endif @@ -79,7 +79,7 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++20" # endif @@ -88,7 +88,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" +# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" # endif # endif @@ -107,7 +107,7 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++23" # endif @@ -116,7 +116,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" +# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" 
# endif # endif @@ -135,7 +135,7 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++26" # endif @@ -144,7 +144,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" +# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" # endif # endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp index eab0313b2c1ef7..f2c31b60a92f90 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp @@ -56,11 +56,17 @@ #elif TEST_STD_VER > 23 -# ifndef __cpp_lib_fstream_native_handle -# error "__cpp_lib_fstream_native_handle should be defined in c++26" -# endif -# if __cpp_lib_fstream_native_handle != 202306L -# error "__cpp_lib_fstream_native_handle should have the value 202306L in c++26" +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION)) +# ifndef __cpp_lib_fstream_native_handle +# error "__cpp_lib_fstream_native_handle should be defined in c++26" +# endif +# if __cpp_lib_fstream_native_handle != 202306L +# error "__cpp_lib_fstream_native_handle should have the value 202306L in c++26" +# endif +# else +# ifdef __cpp_lib_fstream_native_handle 
+# error "__cpp_lib_fstream_native_handle should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION))' is not met!" +# endif # endif #endif // TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp index b678e6804e959c..da9970fd162787 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp @@ -32,47 +32,77 @@ #elif TEST_STD_VER == 14 -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++14" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++14" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++14" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++14" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" 
+# endif # endif #elif TEST_STD_VER == 17 -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++17" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++17" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++17" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++17" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" +# endif # endif #elif TEST_STD_VER == 20 -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++20" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++20" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++20" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++20" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" 
+# endif # endif #elif TEST_STD_VER == 23 -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++23" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++23" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++23" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++23" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" +# endif # endif #elif TEST_STD_VER > 23 -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++26" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++26" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++26" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++26" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" 
+# endif # endif #endif // TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp index e86239f534790e..0f279973dccd8a 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp @@ -38,38 +38,62 @@ #elif TEST_STD_VER == 17 -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++17" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++17" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++17" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++17" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" +# endif # endif #elif TEST_STD_VER == 20 -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++20" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++20" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++20" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++20" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" 
+# endif # endif #elif TEST_STD_VER == 23 -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++23" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++23" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++23" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++23" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" +# endif # endif #elif TEST_STD_VER > 23 -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++26" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++26" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++26" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++26" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" 
+# endif # endif #endif // TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index c319940fe6e49c..41eb7f560213f8 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -1571,11 +1571,17 @@ # error "__cpp_lib_print should not be defined before c++23" # endif -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++14" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++14" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++14" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++14" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" 
+# endif # endif # ifdef __cpp_lib_ranges @@ -2183,7 +2189,7 @@ # error "__cpp_lib_expected should not be defined before c++23" # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++17" # endif @@ -2192,7 +2198,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" +# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" # endif # endif @@ -2568,11 +2574,17 @@ # error "__cpp_lib_print should not be defined before c++23" # endif -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++17" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++17" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++17" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++17" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" 
+# endif # endif # ifdef __cpp_lib_ranges @@ -2671,11 +2683,17 @@ # error "__cpp_lib_saturation_arithmetic should not be defined before c++26" # endif -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++17" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++17" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++17" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++17" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" +# endif # endif # ifdef __cpp_lib_semaphore @@ -3366,7 +3384,7 @@ # error "__cpp_lib_expected should not be defined before c++23" # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++20" # endif @@ -3375,7 +3393,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" +# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" 
# endif # endif @@ -3835,11 +3853,17 @@ # error "__cpp_lib_print should not be defined before c++23" # endif -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++20" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++20" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++20" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++20" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" +# endif # endif # ifndef __cpp_lib_ranges @@ -3944,11 +3968,17 @@ # error "__cpp_lib_saturation_arithmetic should not be defined before c++26" # endif -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++20" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++20" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++20" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++20" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" 
+# endif # endif # if !defined(_LIBCPP_HAS_NO_THREADS) && (!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_SYNC) @@ -4750,7 +4780,7 @@ # error "__cpp_lib_expected should have the value 202211L in c++23" # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++23" # endif @@ -4759,7 +4789,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" +# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" # endif # endif @@ -5267,11 +5297,17 @@ # error "__cpp_lib_print should have the value 202207L in c++23" # endif -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++23" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++23" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++23" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++23" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" 
+# endif # endif # ifndef __cpp_lib_ranges @@ -5460,11 +5496,17 @@ # error "__cpp_lib_saturation_arithmetic should not be defined before c++26" # endif -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++23" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++23" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++23" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++23" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" +# endif # endif # if !defined(_LIBCPP_HAS_NO_THREADS) && (!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_SYNC) @@ -6347,7 +6389,7 @@ # error "__cpp_lib_expected should have the value 202211L in c++26" # endif -# if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY) # ifndef __cpp_lib_filesystem # error "__cpp_lib_filesystem should be defined in c++26" # endif @@ -6356,7 +6398,7 @@ # endif # else # ifdef __cpp_lib_filesystem -# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY' is not met!" +# error "__cpp_lib_filesystem should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)' is not met!" 
# endif # endif @@ -6511,11 +6553,17 @@ # endif # endif -# ifndef __cpp_lib_fstream_native_handle -# error "__cpp_lib_fstream_native_handle should be defined in c++26" -# endif -# if __cpp_lib_fstream_native_handle != 202306L -# error "__cpp_lib_fstream_native_handle should have the value 202306L in c++26" +# if !defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION)) +# ifndef __cpp_lib_fstream_native_handle +# error "__cpp_lib_fstream_native_handle should be defined in c++26" +# endif +# if __cpp_lib_fstream_native_handle != 202306L +# error "__cpp_lib_fstream_native_handle should have the value 202306L in c++26" +# endif +# else +# ifdef __cpp_lib_fstream_native_handle +# error "__cpp_lib_fstream_native_handle should not be defined when the requirement '!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION))' is not met!" +# endif # endif # if !defined(_LIBCPP_VERSION) @@ -6966,11 +7014,17 @@ # error "__cpp_lib_print should have the value 202207L in c++26" # endif -# ifndef __cpp_lib_quoted_string_io -# error "__cpp_lib_quoted_string_io should be defined in c++26" -# endif -# if __cpp_lib_quoted_string_io != 201304L -# error "__cpp_lib_quoted_string_io should have the value 201304L in c++26" +# if !defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# ifndef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should be defined in c++26" +# endif +# if __cpp_lib_quoted_string_io != 201304L +# error "__cpp_lib_quoted_string_io should have the value 201304L in c++26" +# endif +# else +# ifdef __cpp_lib_quoted_string_io +# error "__cpp_lib_quoted_string_io should not be defined when the requirement '!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)' is not met!" 
+# endif # endif # ifndef __cpp_lib_ranges @@ -7174,11 +7228,17 @@ # error "__cpp_lib_saturation_arithmetic should have the value 202311L in c++26" # endif -# ifndef __cpp_lib_scoped_lock -# error "__cpp_lib_scoped_lock should be defined in c++26" -# endif -# if __cpp_lib_scoped_lock != 201703L -# error "__cpp_lib_scoped_lock should have the value 201703L in c++26" +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should be defined in c++26" +# endif +# if __cpp_lib_scoped_lock != 201703L +# error "__cpp_lib_scoped_lock should have the value 201703L in c++26" +# endif +# else +# ifdef __cpp_lib_scoped_lock +# error "__cpp_lib_scoped_lock should not be defined when the requirement '!defined(_LIBCPP_HAS_NO_THREADS)' is not met!" +# endif # endif # if !defined(_LIBCPP_HAS_NO_THREADS) && (!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_SYNC) diff --git a/libcxx/test/std/strings/basic.string/string.capacity/max_size.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/max_size.pass.cpp index 52dbde45dbb265..32ce1c8bf617dc 100644 --- a/libcxx/test/std/strings/basic.string/string.capacity/max_size.pass.cpp +++ b/libcxx/test/std/strings/basic.string/string.capacity/max_size.pass.cpp @@ -7,6 +7,12 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: no-exceptions + +// After changing the alignment of the allocated pointer from 16 to 8, the exception thrown is no longer `bad_alloc` +// but instead length_error on systems using new headers but older dylibs. 
+// +// XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx{{10.13|10.15|11.0}} + // // size_type max_size() const; // constexpr since C++20 diff --git a/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp b/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp index 30fdb19fd3aebe..3b2d093eb34d49 100644 --- a/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp +++ b/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp @@ -121,26 +121,7 @@ int main(int, char**) test_pair_rv(); test_pair_rv(); - /* For ExplicitTypes::CopyOnly, two of the viable candidates for initializing from a non-const xvalue are: - * pair(const pair&); // (defaulted copy constructor) - * template explicit pair(const pair&&); [U1 = ExplicitTypes::CopyOnly, U2 = int] - * This results in diverging behavior for test_convertible which uses copy-list-initialization - * Prior to CWG2137, this would have selected the first (non-explicit) ctor as explicit ctors would not be considered - * Afterwards, it should select the second since it is a better match, and then failed because it is explicit - * - * This may change with future defect reports, and some compilers only have partial support for CWG2137, - * so use std::is_convertible directly to avoid a copy-list-initialization - */ - { - using P1 = std::pair; - using P2 = std::pair; - using UP1 = std::pair&&; - using UP2 = std::pair&&; - static_assert(std::is_constructible::value, ""); - static_assert(std::is_convertible::value, ""); - static_assert(std::is_constructible::value, ""); - static_assert(std::is_convertible::value, ""); - } + test_pair_rv(); test_pair_rv(); test_pair_rv(); diff --git a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt index 978e7095216522..a52140e2b9938a 100644 --- a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt +++ 
b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt @@ -5,7 +5,7 @@ set(LLVM_DIR_SAVE ${LLVM_DIR}) set(Clang_DIR_SAVE ${Clang_DIR}) -find_package(Clang 18) +find_package(Clang 18.1) if (NOT Clang_FOUND) find_package(Clang 17) endif() diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index b7d95d451f2137..9825d4c8ec1df1 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -456,8 +456,8 @@ def add_version_header(tc): "name": "__cpp_lib_filesystem", "values": {"c++17": 201703}, "headers": ["filesystem"], - "test_suite_guard": "!defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY", - "libcxx_guard": "_LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY", + "test_suite_guard": "!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)", + "libcxx_guard": "!defined(_LIBCPP_HAS_NO_FILESYSTEM) && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY", }, { "name": "__cpp_lib_format", @@ -570,6 +570,8 @@ def add_version_header(tc): "name": "__cpp_lib_fstream_native_handle", "values": {"c++26": 202306}, # P1759R6 Native handles and file streams "headers": ["fstream"], + "test_suite_guard": "!defined(_LIBCPP_VERSION) || (!defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION))", + "libcxx_guard": "!defined(_LIBCPP_HAS_NO_FILESYSTEM) && !defined(_LIBCPP_HAS_NO_LOCALIZATION)", }, { "name": "__cpp_lib_function_ref", @@ -879,6 +881,8 @@ def add_version_header(tc): "name": "__cpp_lib_quoted_string_io", "values": {"c++14": 201304}, "headers": ["iomanip"], + "test_suite_guard": "!defined(_LIBCPP_VERSION) || !defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "libcxx_guard": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", }, { "name": "__cpp_lib_ranges", @@ -1016,6 +1020,8 @@ def add_version_header(tc): "name": "__cpp_lib_scoped_lock", "values": {"c++17": 201703}, 
"headers": ["mutex"], + "test_suite_guard": "!defined(_LIBCPP_HAS_NO_THREADS)", + "libcxx_guard": "!defined(_LIBCPP_HAS_NO_THREADS)", }, { "name": "__cpp_lib_semaphore", diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt index 4198827203fc8b..f4722c3b352d4d 100644 --- a/libcxxabi/src/CMakeLists.txt +++ b/libcxxabi/src/CMakeLists.txt @@ -265,7 +265,10 @@ if(LIBCXXABI_HERMETIC_STATIC_LIBRARY) # then its code shouldn't declare them with hidden visibility. They might # actually be provided by a shared library at link time. if (LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS) - target_add_compile_flags_if_supported(cxxabi_static_objects PRIVATE -fvisibility-global-new-delete-hidden) + target_add_compile_flags_if_supported(cxxabi_static_objects PRIVATE -fvisibility-global-new-delete=force-hidden) + if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG) + target_add_compile_flags_if_supported(cxxabi_static_objects PRIVATE -fvisibility-global-new-delete-hidden) + endif() endif() # _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS can be defined in libcxx's # __config_site too. Define it in the same way here, to avoid redefinition diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt index bb1b052f61d875..806d5a783ec39c 100644 --- a/libunwind/CMakeLists.txt +++ b/libunwind/CMakeLists.txt @@ -21,6 +21,7 @@ set(LIBUNWIND_LIBCXX_PATH "${CMAKE_CURRENT_LIST_DIR}/../libcxx" CACHE PATH "Specify path to libc++ source.") include(GNUInstallDirs) +include(CheckSymbolExists) #=============================================================================== # Setup CMake Options @@ -96,6 +97,20 @@ endif() option(LIBUNWIND_HIDE_SYMBOLS "Do not export any symbols from the static library." ${LIBUNWIND_DEFAULT_HIDE_SYMBOLS}) +# If toolchain is FPXX, we switch to FP64 to save the full FPRs. 
See: +# https://web.archive.org/web/20180828210612/https://dmz-portal.mips.com/wiki/MIPS_O32_ABI_-_FR0_and_FR1_Interlinking +check_symbol_exists(__mips_hard_float "" __MIPSHF) +check_symbol_exists(_ABIO32 "" __MIPS_O32) +if (__MIPSHF AND __MIPS_O32) + file(WRITE ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/mips_is_fpxx.c + "#if __mips_fpr != 0\n" + "# error\n" + "#endif\n") + try_compile(MIPS_FPABI_FPXX ${CMAKE_BINARY_DIR} + ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/mips_is_fpxx.c + CMAKE_FLAGS -DCMAKE_C_LINK_EXECUTABLE='echo') +endif() + #=============================================================================== # Configure System #=============================================================================== @@ -179,6 +194,10 @@ if (WIN32) add_compile_flags_if_supported(-Wno-dll-attribute-on-redeclaration) endif() +if (MIPS_FPABI_FPXX) + add_compile_flags(-mfp64) +endif() + # Get feature flags. # Exceptions # Catches C++ exceptions only and tells the compiler to assume that extern C diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt index 9c6f5d908b0945..780430ba70ba60 100644 --- a/libunwind/src/CMakeLists.txt +++ b/libunwind/src/CMakeLists.txt @@ -201,7 +201,10 @@ set_target_properties(unwind_static_objects if(LIBUNWIND_HIDE_SYMBOLS) target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility=hidden) - target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility-global-new-delete-hidden) + target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility-global-new-delete=force-hidden) + if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG) + target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility-global-new-delete-hidden) + endif() target_compile_definitions(unwind_static_objects PRIVATE _LIBUNWIND_HIDE_SYMBOLS) endif() diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index 39f4575031be54..e2074932bc466e 100644 
--- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -652,6 +652,13 @@ void SectionChunk::getRuntimePseudoRelocs( dyn_cast_or_null(file->getSymbol(rel.SymbolTableIndex)); if (!target || !target->isRuntimePseudoReloc) continue; + // If the target doesn't have a chunk allocated, it may be a + // DefinedImportData symbol which ended up unnecessary after GC. + // Normally we wouldn't eliminate section chunks that are referenced, but + // references within DWARF sections don't count for keeping section chunks + // alive. Thus such dangling references in DWARF sections are expected. + if (!target->getChunk()) + continue; int sizeInBits = getRuntimePseudoRelocSize(rel.Type, file->ctx.config.machine); if (sizeInBits == 0) { diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp index 6b516d8c6d5ef8..c4388ba9e40d0b 100644 --- a/lld/COFF/DLL.cpp +++ b/lld/COFF/DLL.cpp @@ -172,7 +172,7 @@ binImports(COFFLinkerContext &ctx, // A chunk for the delay import descriptor table etnry. class DelayDirectoryChunk : public NonSectionChunk { public: - explicit DelayDirectoryChunk(Chunk *n) : dllName(n) {} + explicit DelayDirectoryChunk(Chunk *n) : dllName(n) { setAlignment(4); } size_t getSize() const override { return sizeof(delay_import_directory_table_entry); diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index e0afb6b18805b2..22ee2f133be98a 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -1825,7 +1825,15 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { } } else { config->repro = false; - config->timestamp = time(nullptr); + if (std::optional epoch = + Process::GetEnv("SOURCE_DATE_EPOCH")) { + StringRef value(*epoch); + if (value.getAsInteger(0, config->timestamp)) + fatal(Twine("invalid SOURCE_DATE_EPOCH timestamp: ") + value + + ". 
Expected 32-bit integer"); + } else { + config->timestamp = time(nullptr); + } } // Handle /alternatename diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index ab2ec5b447d000..464f5dfb320ccc 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -11,6 +11,7 @@ #include "Symbols.h" #include "SyntheticSections.h" #include "Target.h" +#include "llvm/Support/LEB128.h" using namespace llvm; using namespace llvm::object; @@ -36,9 +37,12 @@ class LoongArch final : public TargetInfo { bool usesOnlyLowPageBits(RelType type) const override; void relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const override; + bool relaxOnce(int pass) const override; + void finalizeRelax(int passes) const override; }; } // end anonymous namespace +namespace { enum Op { SUB_W = 0x00110000, SUB_D = 0x00118000, @@ -63,6 +67,7 @@ enum Reg { R_T2 = 14, R_T3 = 15, }; +} // namespace // Mask out the input's lowest 12 bits for use with `pcalau12i`, in sequences // like `pcalau12i + addi.[wd]` or `pcalau12i + {ld,st}.*` where the `pcalau12i` @@ -151,6 +156,17 @@ static bool isJirl(uint32_t insn) { return (insn & 0xfc000000) == JIRL; } +static void handleUleb128(uint8_t *loc, uint64_t val) { + const uint32_t maxcount = 1 + 64 / 7; + uint32_t count; + const char *error = nullptr; + uint64_t orig = decodeULEB128(loc, &count, nullptr, &error); + if (count > maxcount || (count == maxcount && error)) + errorOrWarn(getErrorLocation(loc) + "extra space for uleb128"); + uint64_t mask = count < maxcount ? (1ULL << 7 * count) - 1 : -1ULL; + encodeULEB128((orig + val) & mask, loc, count); +} + LoongArch::LoongArch() { // The LoongArch ISA itself does not have a limit on page sizes. 
According to // the ISA manual, the PS (page size) field in MTLB entries and CSR.STLBPS is @@ -392,11 +408,13 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s, case R_LARCH_ADD16: case R_LARCH_ADD32: case R_LARCH_ADD64: + case R_LARCH_ADD_ULEB128: case R_LARCH_SUB6: case R_LARCH_SUB8: case R_LARCH_SUB16: case R_LARCH_SUB32: case R_LARCH_SUB64: + case R_LARCH_SUB_ULEB128: // The LoongArch add/sub relocs behave like the RISCV counterparts; reuse // the RelExpr to avoid code duplication. return R_RISCV_ADD; @@ -465,8 +483,9 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s, case R_LARCH_TLS_GD_HI20: return R_TLSGD_GOT; case R_LARCH_RELAX: - // LoongArch linker relaxation is not implemented yet. - return R_NONE; + return config->relax ? R_RELAX_HINT : R_NONE; + case R_LARCH_ALIGN: + return R_RELAX_HINT; // Other known relocs that are explicitly unimplemented: // @@ -630,6 +649,9 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel, case R_LARCH_ADD64: write64le(loc, read64le(loc) + val); return; + case R_LARCH_ADD_ULEB128: + handleUleb128(loc, val); + return; case R_LARCH_SUB6: *loc = (*loc & 0xc0) | ((*loc - val) & 0x3f); return; @@ -645,6 +667,9 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel, case R_LARCH_SUB64: write64le(loc, read64le(loc) - val); return; + case R_LARCH_SUB_ULEB128: + handleUleb128(loc, -val); + return; case R_LARCH_MARK_LA: case R_LARCH_MARK_PCREL: @@ -659,6 +684,155 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel, } } +static bool relax(InputSection &sec) { + const uint64_t secAddr = sec.getVA(); + const MutableArrayRef relocs = sec.relocs(); + auto &aux = *sec.relaxAux; + bool changed = false; + ArrayRef sa = ArrayRef(aux.anchors); + uint64_t delta = 0; + + std::fill_n(aux.relocTypes.get(), relocs.size(), R_LARCH_NONE); + aux.writes.clear(); + for (auto [i, r] : llvm::enumerate(relocs)) { + const uint64_t loc = secAddr + r.offset - delta; + uint32_t &cur = 
aux.relocDeltas[i], remove = 0; + switch (r.type) { + case R_LARCH_ALIGN: { + const uint64_t addend = + r.sym->isUndefined() ? Log2_64(r.addend) + 1 : r.addend; + const uint64_t allBytes = (1 << (addend & 0xff)) - 4; + const uint64_t align = 1 << (addend & 0xff); + const uint64_t maxBytes = addend >> 8; + const uint64_t off = loc & (align - 1); + const uint64_t curBytes = off == 0 ? 0 : align - off; + // All bytes beyond the alignment boundary should be removed. + // If emit bytes more than max bytes to emit, remove all. + if (maxBytes != 0 && curBytes > maxBytes) + remove = allBytes; + else + remove = allBytes - curBytes; + // If we can't satisfy this alignment, we've found a bad input. + if (LLVM_UNLIKELY(static_cast(remove) < 0)) { + errorOrWarn(getErrorLocation((const uint8_t *)loc) + + "insufficient padding bytes for " + lld::toString(r.type) + + ": " + Twine(allBytes) + " bytes available for " + + "requested alignment of " + Twine(align) + " bytes"); + remove = 0; + } + break; + } + } + + // For all anchors whose offsets are <= r.offset, they are preceded by + // the previous relocation whose `relocDeltas` value equals `delta`. + // Decrease their st_value and update their st_size. + for (; sa.size() && sa[0].offset <= r.offset; sa = sa.slice(1)) { + if (sa[0].end) + sa[0].d->size = sa[0].offset - delta - sa[0].d->value; + else + sa[0].d->value = sa[0].offset - delta; + } + delta += remove; + if (delta != cur) { + cur = delta; + changed = true; + } + } + + for (const SymbolAnchor &a : sa) { + if (a.end) + a.d->size = a.offset - delta - a.d->value; + else + a.d->value = a.offset - delta; + } + // Inform assignAddresses that the size has changed. + if (!isUInt<32>(delta)) + fatal("section size decrease is too large: " + Twine(delta)); + sec.bytesDropped = delta; + return changed; +} + +// When relaxing just R_LARCH_ALIGN, relocDeltas is usually changed only once in +// the absence of a linker script. 
For call and load/store R_LARCH_RELAX, code +// shrinkage may reduce displacement and make more relocations eligible for +// relaxation. Code shrinkage may increase displacement to a call/load/store +// target at a higher fixed address, invalidating an earlier relaxation. Any +// change in section sizes can have cascading effect and require another +// relaxation pass. +bool LoongArch::relaxOnce(int pass) const { + if (config->relocatable) + return false; + + if (pass == 0) + initSymbolAnchors(); + + SmallVector storage; + bool changed = false; + for (OutputSection *osec : outputSections) { + if (!(osec->flags & SHF_EXECINSTR)) + continue; + for (InputSection *sec : getInputSections(*osec, storage)) + changed |= relax(*sec); + } + return changed; +} + +void LoongArch::finalizeRelax(int passes) const { + log("relaxation passes: " + Twine(passes)); + SmallVector storage; + for (OutputSection *osec : outputSections) { + if (!(osec->flags & SHF_EXECINSTR)) + continue; + for (InputSection *sec : getInputSections(*osec, storage)) { + RelaxAux &aux = *sec->relaxAux; + if (!aux.relocDeltas) + continue; + + MutableArrayRef rels = sec->relocs(); + ArrayRef old = sec->content(); + size_t newSize = old.size() - aux.relocDeltas[rels.size() - 1]; + uint8_t *p = context().bAlloc.Allocate(newSize); + uint64_t offset = 0; + int64_t delta = 0; + sec->content_ = p; + sec->size = newSize; + sec->bytesDropped = 0; + + // Update section content: remove NOPs for R_LARCH_ALIGN and rewrite + // instructions for relaxed relocations. + for (size_t i = 0, e = rels.size(); i != e; ++i) { + uint32_t remove = aux.relocDeltas[i] - delta; + delta = aux.relocDeltas[i]; + if (remove == 0 && aux.relocTypes[i] == R_LARCH_NONE) + continue; + + // Copy from last location to the current relocated location. 
+ const Relocation &r = rels[i]; + uint64_t size = r.offset - offset; + memcpy(p, old.data() + offset, size); + p += size; + offset = r.offset + remove; + } + memcpy(p, old.data() + offset, old.size() - offset); + + // Subtract the previous relocDeltas value from the relocation offset. + // For a pair of R_LARCH_XXX/R_LARCH_RELAX with the same offset, decrease + // their r_offset by the same delta. + delta = 0; + for (size_t i = 0, e = rels.size(); i != e;) { + uint64_t cur = rels[i].offset; + do { + rels[i].offset -= delta; + if (aux.relocTypes[i] != R_LARCH_NONE) + rels[i].type = aux.relocTypes[i]; + } while (++i != e && rels[i].offset == cur); + delta = aux.relocDeltas[i - 1]; + } + } + } +} + TargetInfo *elf::getLoongArchTargetInfo() { static LoongArch target; return ⌖ diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index de52f6a79a40b9..019c073bd541b6 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -26,6 +26,7 @@ using namespace lld::elf; constexpr uint64_t ppc64TocOffset = 0x8000; constexpr uint64_t dynamicThreadPointerOffset = 0x8000; +namespace { // The instruction encoding of bits 21-30 from the ISA for the Xform and Dform // instructions that can be used as part of the initial exec TLS sequence. enum XFormOpcd { @@ -139,6 +140,7 @@ enum class PPCPrefixedInsn : uint64_t { PSTXV = PREFIX_8LS | 0xd8000000, PSTXVP = PREFIX_8LS | 0xf8000000 }; + static bool checkPPCLegacyInsn(uint32_t encoding) { PPCLegacyInsn insn = static_cast(encoding); if (insn == PPCLegacyInsn::NOINSN) @@ -164,7 +166,6 @@ enum class LegacyToPrefixMask : uint64_t { 0x8000000003e00000, // S/T (6-10) - The [S/T]X bit moves from 28 to 5. 
}; -namespace { class PPC64 final : public TargetInfo { public: PPC64(); diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index d7d3d3e4781497..4798c86f7d1b61 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -45,6 +45,7 @@ class RISCV final : public TargetInfo { uint64_t val) const override; void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override; bool relaxOnce(int pass) const override; + void finalizeRelax(int passes) const override; }; } // end anonymous namespace @@ -56,11 +57,13 @@ class RISCV final : public TargetInfo { const uint64_t dtpOffset = 0x800; +namespace { enum Op { ADDI = 0x13, AUIPC = 0x17, JALR = 0x67, LD = 0x3003, + LUI = 0x37, LW = 0x2003, SRLI = 0x5013, SUB = 0x40000033, @@ -73,8 +76,10 @@ enum Reg { X_T0 = 5, X_T1 = 6, X_T2 = 7, + X_A0 = 10, X_T3 = 28, }; +} // namespace static uint32_t hi20(uint32_t val) { return (val + 0x800) >> 12; } static uint32_t lo12(uint32_t val) { return val & 4095; } @@ -119,6 +124,7 @@ RISCV::RISCV() { tlsGotRel = R_RISCV_TLS_TPREL32; } gotRel = symbolicRel; + tlsDescRel = R_RISCV_TLSDESC; // .got[0] = _DYNAMIC gotHeaderEntriesNum = 1; @@ -187,6 +193,8 @@ int64_t RISCV::getImplicitAddend(const uint8_t *buf, RelType type) const { case R_RISCV_JUMP_SLOT: // These relocations are defined as not having an implicit addend. return 0; + case R_RISCV_TLSDESC: + return config->is64 ? 
read64le(buf + 8) : read32le(buf + 4); } } @@ -295,6 +303,12 @@ RelExpr RISCV::getRelExpr(const RelType type, const Symbol &s, case R_RISCV_PCREL_LO12_I: case R_RISCV_PCREL_LO12_S: return R_RISCV_PC_INDIRECT; + case R_RISCV_TLSDESC_HI20: + case R_RISCV_TLSDESC_LOAD_LO12: + case R_RISCV_TLSDESC_ADD_LO12: + return R_TLSDESC_PC; + case R_RISCV_TLSDESC_CALL: + return R_TLSDESC_CALL; case R_RISCV_TLS_GD_HI20: return R_TLSGD_PC; case R_RISCV_TLS_GOT_HI20: @@ -419,6 +433,7 @@ void RISCV::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { case R_RISCV_GOT_HI20: case R_RISCV_PCREL_HI20: + case R_RISCV_TLSDESC_HI20: case R_RISCV_TLS_GD_HI20: case R_RISCV_TLS_GOT_HI20: case R_RISCV_TPREL_HI20: @@ -430,6 +445,8 @@ void RISCV::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { } case R_RISCV_PCREL_LO12_I: + case R_RISCV_TLSDESC_LOAD_LO12: + case R_RISCV_TLSDESC_ADD_LO12: case R_RISCV_TPREL_LO12_I: case R_RISCV_LO12_I: { uint64_t hi = (val + 0x800) >> 12; @@ -513,32 +530,133 @@ void RISCV::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { break; case R_RISCV_RELAX: - return; // Ignored (for now) - + return; + case R_RISCV_TLSDESC: + // The addend is stored in the second word. 
+ if (config->is64) + write64le(loc + 8, val); + else + write32le(loc + 4, val); + break; default: llvm_unreachable("unknown relocation"); } } +static bool relaxable(ArrayRef relocs, size_t i) { + return i + 1 != relocs.size() && relocs[i + 1].type == R_RISCV_RELAX; +} + +static void tlsdescToIe(uint8_t *loc, const Relocation &rel, uint64_t val) { + switch (rel.type) { + case R_RISCV_TLSDESC_HI20: + case R_RISCV_TLSDESC_LOAD_LO12: + write32le(loc, 0x00000013); // nop + break; + case R_RISCV_TLSDESC_ADD_LO12: + write32le(loc, utype(AUIPC, X_A0, hi20(val))); // auipc a0, + break; + case R_RISCV_TLSDESC_CALL: + if (config->is64) + write32le(loc, itype(LD, X_A0, X_A0, lo12(val))); // ld a0,(a0) + else + write32le(loc, itype(LW, X_A0, X_A0, lo12(val))); // lw a0,(a0) + break; + default: + llvm_unreachable("unsupported relocation for TLSDESC to IE"); + } +} + +static void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { + switch (rel.type) { + case R_RISCV_TLSDESC_HI20: + case R_RISCV_TLSDESC_LOAD_LO12: + write32le(loc, 0x00000013); // nop + return; + case R_RISCV_TLSDESC_ADD_LO12: + if (isInt<12>(val)) + write32le(loc, 0x00000013); // nop + else + write32le(loc, utype(LUI, X_A0, hi20(val))); // lui a0, + return; + case R_RISCV_TLSDESC_CALL: + if (isInt<12>(val)) + write32le(loc, itype(ADDI, X_A0, 0, val)); // addi a0,zero, + else + write32le(loc, itype(ADDI, X_A0, X_A0, lo12(val))); // addi a0,a0, + return; + default: + llvm_unreachable("unsupported relocation for TLSDESC to LE"); + } +} + void RISCV::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { uint64_t secAddr = sec.getOutputSection()->addr; if (auto *s = dyn_cast(&sec)) secAddr += s->outSecOff; else if (auto *ehIn = dyn_cast(&sec)) secAddr += ehIn->getParent()->outSecOff; - for (size_t i = 0, size = sec.relocs().size(); i != size; ++i) { - const Relocation &rel = sec.relocs()[i]; + uint64_t tlsdescVal = 0; + bool tlsdescRelax = false, isToLe = false; + const ArrayRef relocs = 
sec.relocs(); + for (size_t i = 0, size = relocs.size(); i != size; ++i) { + const Relocation &rel = relocs[i]; uint8_t *loc = buf + rel.offset; - const uint64_t val = + uint64_t val = sec.getRelocTargetVA(sec.file, rel.type, rel.addend, secAddr + rel.offset, *rel.sym, rel.expr); switch (rel.expr) { case R_RELAX_HINT: + continue; + case R_TLSDESC_PC: + // For R_RISCV_TLSDESC_HI20, store &got(sym)-PC to be used by the + // following two instructions L[DW] and ADDI. + if (rel.type == R_RISCV_TLSDESC_HI20) + tlsdescVal = val; + else + val = tlsdescVal; break; + case R_RELAX_TLS_GD_TO_IE: + // Only R_RISCV_TLSDESC_HI20 reaches here. tlsdescVal will be finalized + // after we see R_RISCV_TLSDESC_ADD_LO12 in the R_RELAX_TLS_GD_TO_LE case. + // The net effect is that tlsdescVal will be smaller than `val` to take + // into account of NOP instructions (in the absence of R_RISCV_RELAX) + // before AUIPC. + tlsdescVal = val + rel.offset; + isToLe = false; + tlsdescRelax = relaxable(relocs, i); + if (!tlsdescRelax) + tlsdescToIe(loc, rel, val); + continue; + case R_RELAX_TLS_GD_TO_LE: + // See the comment in handleTlsRelocation. For TLSDESC=>IE, + // R_RISCV_TLSDESC_{LOAD_LO12,ADD_LO12,CALL} also reach here. If isToIe is + // true, this is actually TLSDESC=>IE optimization. + if (rel.type == R_RISCV_TLSDESC_HI20) { + tlsdescVal = val; + isToLe = true; + tlsdescRelax = relaxable(relocs, i); + } else { + if (!isToLe && rel.type == R_RISCV_TLSDESC_ADD_LO12) + tlsdescVal -= rel.offset; + val = tlsdescVal; + } + // When NOP conversion is eligible and relaxation applies, don't write a + // NOP in case an unrelated instruction follows the current instruction. 
+ if (tlsdescRelax && + (rel.type == R_RISCV_TLSDESC_HI20 || + rel.type == R_RISCV_TLSDESC_LOAD_LO12 || + (rel.type == R_RISCV_TLSDESC_ADD_LO12 && isToLe && !hi20(val)))) + continue; + if (isToLe) + tlsdescToLe(loc, rel, val); + else + tlsdescToIe(loc, rel, val); + continue; case R_RISCV_LEB128: if (i + 1 < size) { - const Relocation &rel1 = sec.relocs()[i + 1]; + const Relocation &rel1 = relocs[i + 1]; if (rel.type == R_RISCV_SET_ULEB128 && rel1.type == R_RISCV_SUB_ULEB128 && rel.offset == rel1.offset) { auto val = rel.sym->getVA(rel.addend) - rel1.sym->getVA(rel1.addend); @@ -554,39 +672,19 @@ void RISCV::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { ": R_RISCV_SET_ULEB128 not paired with R_RISCV_SUB_SET128"); return; default: - relocate(loc, rel, val); break; } + relocate(loc, rel, val); } } -namespace { -struct SymbolAnchor { - uint64_t offset; - Defined *d; - bool end; // true for the anchor of st_value+st_size -}; -} // namespace - -struct elf::RISCVRelaxAux { - // This records symbol start and end offsets which will be adjusted according - // to the nearest relocDeltas element. - SmallVector anchors; - // For relocations[i], the actual offset is r_offset - (i ? relocDeltas[i-1] : - // 0). - std::unique_ptr relocDeltas; - // For relocations[i], the actual type is relocTypes[i]. 
- std::unique_ptr relocTypes; - SmallVector writes; -}; - -static void initSymbolAnchors() { +void elf::initSymbolAnchors() { SmallVector storage; for (OutputSection *osec : outputSections) { if (!(osec->flags & SHF_EXECINSTR)) continue; for (InputSection *sec : getInputSections(*osec, storage)) { - sec->relaxAux = make(); + sec->relaxAux = make(); if (sec->relocs().size()) { sec->relaxAux->relocDeltas = std::make_unique(sec->relocs().size()); @@ -715,14 +813,16 @@ static void relaxHi20Lo12(const InputSection &sec, size_t i, uint64_t loc, static bool relax(InputSection &sec) { const uint64_t secAddr = sec.getVA(); + const MutableArrayRef relocs = sec.relocs(); auto &aux = *sec.relaxAux; bool changed = false; ArrayRef sa = ArrayRef(aux.anchors); uint64_t delta = 0; + bool tlsdescRelax = false, toLeShortForm = false; - std::fill_n(aux.relocTypes.get(), sec.relocs().size(), R_RISCV_NONE); + std::fill_n(aux.relocTypes.get(), relocs.size(), R_RISCV_NONE); aux.writes.clear(); - for (auto [i, r] : llvm::enumerate(sec.relocs())) { + for (auto [i, r] : llvm::enumerate(relocs)) { const uint64_t loc = secAddr + r.offset - delta; uint32_t &cur = aux.relocDeltas[i], remove = 0; switch (r.type) { @@ -743,25 +843,37 @@ static bool relax(InputSection &sec) { } case R_RISCV_CALL: case R_RISCV_CALL_PLT: - if (i + 1 != sec.relocs().size() && - sec.relocs()[i + 1].type == R_RISCV_RELAX) + if (relaxable(relocs, i)) relaxCall(sec, i, loc, r, remove); break; case R_RISCV_TPREL_HI20: case R_RISCV_TPREL_ADD: case R_RISCV_TPREL_LO12_I: case R_RISCV_TPREL_LO12_S: - if (i + 1 != sec.relocs().size() && - sec.relocs()[i + 1].type == R_RISCV_RELAX) + if (relaxable(relocs, i)) relaxTlsLe(sec, i, loc, r, remove); break; case R_RISCV_HI20: case R_RISCV_LO12_I: case R_RISCV_LO12_S: - if (i + 1 != sec.relocs().size() && - sec.relocs()[i + 1].type == R_RISCV_RELAX) + if (relaxable(relocs, i)) relaxHi20Lo12(sec, i, loc, r, remove); break; + case R_RISCV_TLSDESC_HI20: + // For TLSDESC=>LE, we can use 
the short form if hi20 is zero. + tlsdescRelax = relaxable(relocs, i); + toLeShortForm = tlsdescRelax && r.expr == R_RELAX_TLS_GD_TO_LE && + !hi20(r.sym->getVA(r.addend)); + [[fallthrough]]; + case R_RISCV_TLSDESC_LOAD_LO12: + // For TLSDESC=>LE/IE, AUIPC and L[DW] are removed if relaxable. + if (tlsdescRelax && r.expr != R_TLSDESC_PC) + remove = 4; + break; + case R_RISCV_TLSDESC_ADD_LO12: + if (toLeShortForm) + remove = 4; + break; } // For all anchors whose offsets are <= r.offset, they are preceded by @@ -819,7 +931,7 @@ bool RISCV::relaxOnce(int pass) const { return changed; } -void elf::riscvFinalizeRelax(int passes) { +void RISCV::finalizeRelax(int passes) const { llvm::TimeTraceScope timeScope("Finalize RISC-V relaxation"); log("relaxation passes: " + Twine(passes)); SmallVector storage; @@ -827,7 +939,7 @@ void elf::riscvFinalizeRelax(int passes) { if (!(osec->flags & SHF_EXECINSTR)) continue; for (InputSection *sec : getInputSections(*osec, storage)) { - RISCVRelaxAux &aux = *sec->relaxAux; + RelaxAux &aux = *sec->relaxAux; if (!aux.relocDeltas) continue; diff --git a/lld/ELF/Arch/SystemZ.cpp b/lld/ELF/Arch/SystemZ.cpp new file mode 100644 index 00000000000000..d37db6877559dc --- /dev/null +++ b/lld/ELF/Arch/SystemZ.cpp @@ -0,0 +1,607 @@ +//===- SystemZ.cpp --------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "OutputSections.h" +#include "Symbols.h" +#include "SyntheticSections.h" +#include "Target.h" +#include "lld/Common/ErrorHandler.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/Support/Endian.h" + +using namespace llvm; +using namespace llvm::support::endian; +using namespace llvm::ELF; +using namespace lld; +using namespace lld::elf; + +namespace { +class SystemZ : public TargetInfo { +public: + SystemZ(); + int getTlsGdRelaxSkip(RelType type) const override; + RelExpr getRelExpr(RelType type, const Symbol &s, + const uint8_t *loc) const override; + RelType getDynRel(RelType type) const override; + void writeGotHeader(uint8_t *buf) const override; + void writeGotPlt(uint8_t *buf, const Symbol &s) const override; + void writeIgotPlt(uint8_t *buf, const Symbol &s) const override; + void writePltHeader(uint8_t *buf) const override; + void addPltHeaderSymbols(InputSection &isd) const override; + void writePlt(uint8_t *buf, const Symbol &sym, + uint64_t pltEntryAddr) const override; + RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override; + RelExpr adjustGotPcExpr(RelType type, int64_t addend, + const uint8_t *loc) const override; + bool relaxOnce(int pass) const override; + void relocate(uint8_t *loc, const Relocation &rel, + uint64_t val) const override; + int64_t getImplicitAddend(const uint8_t *buf, RelType type) const override; + +private: + void relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; +}; +} // namespace + +SystemZ::SystemZ() { + copyRel = R_390_COPY; + gotRel = R_390_GLOB_DAT; + pltRel = R_390_JMP_SLOT; + relativeRel = 
R_390_RELATIVE; + iRelativeRel = R_390_IRELATIVE; + symbolicRel = R_390_64; + tlsGotRel = R_390_TLS_TPOFF; + tlsModuleIndexRel = R_390_TLS_DTPMOD; + tlsOffsetRel = R_390_TLS_DTPOFF; + gotHeaderEntriesNum = 3; + gotPltHeaderEntriesNum = 0; + gotEntrySize = 8; + pltHeaderSize = 32; + pltEntrySize = 32; + ipltEntrySize = 32; + + // This "trap instruction" is used to fill gaps between sections. + // On SystemZ, the behavior of the GNU ld is to fill those gaps + // with nop instructions instead - and unfortunately the default + // glibc crt object files (used to) rely on that behavior since + // they use an alignment on the .init section fragments that causes + // gaps which must be filled with nops as they are being executed. + // Therefore, we provide a nop instruction as "trapInstr" here. + trapInstr = {0x07, 0x07, 0x07, 0x07}; + + defaultImageBase = 0x1000000; +} + +RelExpr SystemZ::getRelExpr(RelType type, const Symbol &s, + const uint8_t *loc) const { + switch (type) { + case R_390_NONE: + return R_NONE; + // Relocations targeting the symbol value. + case R_390_8: + case R_390_12: + case R_390_16: + case R_390_20: + case R_390_32: + case R_390_64: + return R_ABS; + case R_390_PC16: + case R_390_PC32: + case R_390_PC64: + case R_390_PC12DBL: + case R_390_PC16DBL: + case R_390_PC24DBL: + case R_390_PC32DBL: + return R_PC; + case R_390_GOTOFF16: + case R_390_GOTOFF: // a.k.a. R_390_GOTOFF32 + case R_390_GOTOFF64: + return R_GOTREL; + // Relocations targeting the PLT associated with the symbol. + case R_390_PLT32: + case R_390_PLT64: + case R_390_PLT12DBL: + case R_390_PLT16DBL: + case R_390_PLT24DBL: + case R_390_PLT32DBL: + return R_PLT_PC; + case R_390_PLTOFF16: + case R_390_PLTOFF32: + case R_390_PLTOFF64: + return R_PLT_GOTREL; + // Relocations targeting the GOT entry associated with the symbol. 
+ case R_390_GOTENT: + return R_GOT_PC; + case R_390_GOT12: + case R_390_GOT16: + case R_390_GOT20: + case R_390_GOT32: + case R_390_GOT64: + return R_GOT_OFF; + // Relocations targeting the GOTPLT entry associated with the symbol. + case R_390_GOTPLTENT: + return R_GOTPLT_PC; + case R_390_GOTPLT12: + case R_390_GOTPLT16: + case R_390_GOTPLT20: + case R_390_GOTPLT32: + case R_390_GOTPLT64: + return R_GOTPLT_GOTREL; + // Relocations targeting _GLOBAL_OFFSET_TABLE_. + case R_390_GOTPC: + case R_390_GOTPCDBL: + return R_GOTONLY_PC; + // TLS-related relocations. + case R_390_TLS_LOAD: + return R_NONE; + case R_390_TLS_GDCALL: + return R_TLSGD_PC; + case R_390_TLS_LDCALL: + return R_TLSLD_PC; + case R_390_TLS_GD32: + case R_390_TLS_GD64: + return R_TLSGD_GOT; + case R_390_TLS_LDM32: + case R_390_TLS_LDM64: + return R_TLSLD_GOT; + case R_390_TLS_LDO32: + case R_390_TLS_LDO64: + return R_DTPREL; + case R_390_TLS_LE32: + case R_390_TLS_LE64: + return R_TPREL; + case R_390_TLS_IE32: + case R_390_TLS_IE64: + return R_GOT; + case R_390_TLS_GOTIE12: + case R_390_TLS_GOTIE20: + case R_390_TLS_GOTIE32: + case R_390_TLS_GOTIE64: + return R_GOT_OFF; + case R_390_TLS_IEENT: + return R_GOT_PC; + + default: + error(getErrorLocation(loc) + "unknown relocation (" + Twine(type) + + ") against symbol " + toString(s)); + return R_NONE; + } +} + +void SystemZ::writeGotHeader(uint8_t *buf) const { + // _GLOBAL_OFFSET_TABLE_[0] holds the value of _DYNAMIC. + // _GLOBAL_OFFSET_TABLE_[1] and [2] are reserved. 
+ write64be(buf, mainPart->dynamic->getVA()); +} + +void SystemZ::writeGotPlt(uint8_t *buf, const Symbol &s) const { + write64be(buf, s.getPltVA() + 14); +} + +void SystemZ::writeIgotPlt(uint8_t *buf, const Symbol &s) const { + if (config->writeAddends) + write64be(buf, s.getVA()); +} + +void SystemZ::writePltHeader(uint8_t *buf) const { + const uint8_t pltData[] = { + 0xe3, 0x10, 0xf0, 0x38, 0x00, 0x24, // stg %r1,56(%r15) + 0xc0, 0x10, 0x00, 0x00, 0x00, 0x00, // larl %r1,_GLOBAL_OFFSET_TABLE_ + 0xd2, 0x07, 0xf0, 0x30, 0x10, 0x08, // mvc 48(8,%r15),8(%r1) + 0xe3, 0x10, 0x10, 0x10, 0x00, 0x04, // lg %r1,16(%r1) + 0x07, 0xf1, // br %r1 + 0x07, 0x00, // nopr + 0x07, 0x00, // nopr + 0x07, 0x00, // nopr + }; + memcpy(buf, pltData, sizeof(pltData)); + uint64_t got = in.got->getVA(); + uint64_t plt = in.plt->getVA(); + write32be(buf + 8, (got - plt - 6) >> 1); +} + +void SystemZ::addPltHeaderSymbols(InputSection &isec) const { + // The PLT header needs a reference to _GLOBAL_OFFSET_TABLE_, so we + // must ensure the .got section is created even if otherwise unused. 
+ in.got->hasGotOffRel.store(true, std::memory_order_relaxed); +} + +void SystemZ::writePlt(uint8_t *buf, const Symbol &sym, + uint64_t pltEntryAddr) const { + const uint8_t inst[] = { + 0xc0, 0x10, 0x00, 0x00, 0x00, 0x00, // larl %r1,<.got.plt slot> + 0xe3, 0x10, 0x10, 0x00, 0x00, 0x04, // lg %r1,0(%r1) + 0x07, 0xf1, // br %r1 + 0x0d, 0x10, // basr %r1,%r0 + 0xe3, 0x10, 0x10, 0x0c, 0x00, 0x14, // lgf %r1,12(%r1) + 0xc0, 0xf4, 0x00, 0x00, 0x00, 0x00, // jg + 0x00, 0x00, 0x00, 0x00, // + }; + memcpy(buf, inst, sizeof(inst)); + + write32be(buf + 2, (sym.getGotPltVA() - pltEntryAddr) >> 1); + write32be(buf + 24, (in.plt->getVA() - pltEntryAddr - 22) >> 1); + write32be(buf + 28, in.relaPlt->entsize * sym.getPltIdx()); +} + +int64_t SystemZ::getImplicitAddend(const uint8_t *buf, RelType type) const { + switch (type) { + case R_390_8: + return SignExtend64<8>(*buf); + case R_390_16: + case R_390_PC16: + return SignExtend64<16>(read16be(buf)); + case R_390_PC16DBL: + return SignExtend64<16>(read16be(buf)) << 1; + case R_390_32: + case R_390_PC32: + return SignExtend64<32>(read32be(buf)); + case R_390_PC32DBL: + return SignExtend64<32>(read32be(buf)) << 1; + case R_390_64: + case R_390_PC64: + case R_390_TLS_DTPMOD: + case R_390_TLS_DTPOFF: + case R_390_TLS_TPOFF: + case R_390_GLOB_DAT: + case R_390_RELATIVE: + case R_390_IRELATIVE: + return read64be(buf); + case R_390_COPY: + case R_390_JMP_SLOT: + case R_390_NONE: + // These relocations are defined as not having an implicit addend. 
+ return 0; + default: + internalLinkerError(getErrorLocation(buf), + "cannot read addend for relocation " + toString(type)); + return 0; + } +} + +RelType SystemZ::getDynRel(RelType type) const { + if (type == R_390_64 || type == R_390_PC64) + return type; + return R_390_NONE; +} + +RelExpr SystemZ::adjustTlsExpr(RelType type, RelExpr expr) const { + if (expr == R_RELAX_TLS_GD_TO_IE) + return R_RELAX_TLS_GD_TO_IE_GOT_OFF; + return expr; +} + +int SystemZ::getTlsGdRelaxSkip(RelType type) const { + // A __tls_get_offset call instruction is marked with 2 relocations: + // + // R_390_TLS_GDCALL / R_390_TLS_LDCALL: marker relocation + // R_390_PLT32DBL: __tls_get_offset + // + // After the relaxation we no longer call __tls_get_offset and should skip + // both relocations to not create a false dependence on __tls_get_offset + // being defined. + // + // Note that this mechanism only works correctly if the R_390_TLS_[GL]DCALL + // is seen immediately *before* the R_390_PLT32DBL. Unfortunately, current + // compilers on the platform will typically generate the inverse sequence. + // To fix this, we sort relocations by offset in RelocationScanner::scan; + // this ensures the correct sequence as the R_390_TLS_[GL]DCALL applies to + // the first byte of the brasl instruction, while the R_390_PLT32DBL applies + // to its third byte (the relative displacement). 
+ + if (type == R_390_TLS_GDCALL || type == R_390_TLS_LDCALL) + return 2; + return 1; +} + +void SystemZ::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { + // The general-dynamic code sequence for a global `x`: + // + // Instruction Relocation Symbol + // ear %rX,%a0 + // sllg %rX,%rX,32 + // ear %rX,%a1 + // larl %r12,_GLOBAL_OFFSET_TABLE_ R_390_GOTPCDBL _GLOBAL_OFFSET_TABLE_ + // lgrl %r2,.LC0 R_390_PC32DBL .LC0 + // brasl %r14,__tls_get_offset@plt R_390_TLS_GDCALL x + // :tls_gdcall:x R_390_PLT32DBL __tls_get_offset + // la %r2,0(%r2,%rX) + // + // .LC0: + // .quad x@TLSGD R_390_TLS_GD64 x + // + // Relaxing to initial-exec entails: + // 1) Replacing the call by a load from the GOT. + // 2) Replacing the relocation on the constant LC0 by R_390_TLS_GOTIE64. + + switch (rel.type) { + case R_390_TLS_GDCALL: + // brasl %r14,__tls_get_offset@plt -> lg %r2,0(%r2,%r12) + write16be(loc, 0xe322); + write32be(loc + 2, 0xc0000004); + break; + case R_390_TLS_GD64: + relocateNoSym(loc, R_390_TLS_GOTIE64, val); + break; + default: + llvm_unreachable("unsupported relocation for TLS GD to IE relaxation"); + } +} + +void SystemZ::relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { + // The general-dynamic code sequence for a global `x`: + // + // Instruction Relocation Symbol + // ear %rX,%a0 + // sllg %rX,%rX,32 + // ear %rX,%a1 + // larl %r12,_GLOBAL_OFFSET_TABLE_ R_390_GOTPCDBL _GLOBAL_OFFSET_TABLE_ + // lgrl %r2,.LC0 R_390_PC32DBL .LC0 + // brasl %r14,__tls_get_offset@plt R_390_TLS_GDCALL x + // :tls_gdcall:x R_390_PLT32DBL __tls_get_offset + // la %r2,0(%r2,%rX) + // + // .LC0: + // .quad x@tlsgd R_390_TLS_GD64 x + // + // Relaxing to local-exec entails: + // 1) Replacing the call by a nop. + // 2) Replacing the relocation on the constant LC0 by R_390_TLS_LE64. + + switch (rel.type) { + case R_390_TLS_GDCALL: + // brasl %r14,__tls_get_offset@plt -> brcl 0,. 
+ write16be(loc, 0xc004); + write32be(loc + 2, 0x00000000); + break; + case R_390_TLS_GD64: + relocateNoSym(loc, R_390_TLS_LE64, val); + break; + default: + llvm_unreachable("unsupported relocation for TLS GD to LE relaxation"); + } +} + +void SystemZ::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { + // The local-dynamic code sequence for a global `x`: + // + // Instruction Relocation Symbol + // ear %rX,%a0 + // sllg %rX,%rX,32 + // ear %rX,%a1 + // larl %r12,_GLOBAL_OFFSET_TABLE_ R_390_GOTPCDBL _GLOBAL_OFFSET_TABLE_ + // lgrl %r2,.LC0 R_390_PC32DBL .LC0 + // brasl %r14,__tls_get_offset@plt R_390_TLS_LDCALL + // :tls_ldcall: R_390_PLT32DBL __tls_get_offset + // la %r2,0(%r2,%rX) + // lgrl %rY,.LC1 R_390_PC32DBL .LC1 + // la %r2,0(%r2,%rY) + // + // .LC0: + // .quad @tlsldm R_390_TLS_LDM64 + // .LC1: + // .quad x@dtpoff R_390_TLS_LDO64 x + // + // Relaxing to local-exec entails: + // 1) Replacing the call by a nop. + // 2) Replacing the constant LC0 by 0 (i.e. ignoring the relocation). + // 3) Replacing the relocation on the constant LC1 by R_390_TLS_LE64. + + switch (rel.type) { + case R_390_TLS_LDCALL: + // brasl %r14,__tls_get_offset@plt -> brcl 0,. + write16be(loc, 0xc004); + write32be(loc + 2, 0x00000000); + break; + case R_390_TLS_LDM64: + break; + case R_390_TLS_LDO64: + relocateNoSym(loc, R_390_TLS_LE64, val); + break; + default: + llvm_unreachable("unsupported relocation for TLS LD to LE relaxation"); + } +} + +RelExpr SystemZ::adjustGotPcExpr(RelType type, int64_t addend, + const uint8_t *loc) const { + // Only R_390_GOTENT with addend 2 can be relaxed. + if (!config->relax || addend != 2 || type != R_390_GOTENT) + return R_GOT_PC; + const uint16_t op = read16be(loc - 2); + + // lgrl rx,sym@GOTENT -> larl rx, sym + // This relaxation is legal if "sym" binds locally (which was already + // verified by our caller) and is in-range and properly aligned for a + // LARL instruction. 
We cannot verify the latter constraint here, so + // we assume it is true and revert the decision later on in relaxOnce + // if necessary. + if ((op & 0xff0f) == 0xc408) + return R_RELAX_GOT_PC; + + return R_GOT_PC; +} + +bool SystemZ::relaxOnce(int pass) const { + // If we decided in adjustGotPcExpr to relax a R_390_GOTENT, + // we need to validate the target symbol is in-range and aligned. + SmallVector storage; + bool changed = false; + for (OutputSection *osec : outputSections) { + if (!(osec->flags & SHF_EXECINSTR)) + continue; + for (InputSection *sec : getInputSections(*osec, storage)) { + for (Relocation &rel : sec->relocs()) { + if (rel.expr != R_RELAX_GOT_PC) + continue; + + uint64_t v = sec->getRelocTargetVA( + sec->file, rel.type, rel.addend, + sec->getOutputSection()->addr + rel.offset, *rel.sym, rel.expr); + if (isInt<33>(v) && !(v & 1)) + continue; + if (rel.sym->auxIdx == 0) { + rel.sym->allocateAux(); + addGotEntry(*rel.sym); + changed = true; + } + rel.expr = R_GOT_PC; + } + } + } + return changed; +} + +void SystemZ::relaxGot(uint8_t *loc, const Relocation &rel, + uint64_t val) const { + assert(isInt<33>(val) && + "R_390_GOTENT should not have been relaxed if it overflows"); + assert(!(val & 1) && + "R_390_GOTENT should not have been relaxed if it is misaligned"); + const uint16_t op = read16be(loc - 2); + + // lgrl rx,sym@GOTENT -> larl rx, sym + if ((op & 0xff0f) == 0xc408) { + write16be(loc - 2, 0xc000 | (op & 0x00f0)); + write32be(loc, val >> 1); + } +} + +void SystemZ::relocate(uint8_t *loc, const Relocation &rel, + uint64_t val) const { + switch (rel.expr) { + case R_RELAX_GOT_PC: + return relaxGot(loc, rel, val); + case R_RELAX_TLS_GD_TO_IE_GOT_OFF: + return relaxTlsGdToIe(loc, rel, val); + case R_RELAX_TLS_GD_TO_LE: + return relaxTlsGdToLe(loc, rel, val); + case R_RELAX_TLS_LD_TO_LE: + return relaxTlsLdToLe(loc, rel, val); + default: + break; + } + switch (rel.type) { + case R_390_8: + checkIntUInt(loc, val, 8, rel); + *loc = val; + 
break; + case R_390_12: + case R_390_GOT12: + case R_390_GOTPLT12: + case R_390_TLS_GOTIE12: + checkUInt(loc, val, 12, rel); + write16be(loc, (read16be(loc) & 0xF000) | val); + break; + case R_390_PC12DBL: + case R_390_PLT12DBL: + checkInt(loc, val, 13, rel); + checkAlignment(loc, val, 2, rel); + write16be(loc, (read16be(loc) & 0xF000) | ((val >> 1) & 0x0FFF)); + break; + case R_390_16: + case R_390_GOT16: + case R_390_GOTPLT16: + case R_390_GOTOFF16: + case R_390_PLTOFF16: + checkIntUInt(loc, val, 16, rel); + write16be(loc, val); + break; + case R_390_PC16: + checkInt(loc, val, 16, rel); + write16be(loc, val); + break; + case R_390_PC16DBL: + case R_390_PLT16DBL: + checkInt(loc, val, 17, rel); + checkAlignment(loc, val, 2, rel); + write16be(loc, val >> 1); + break; + case R_390_20: + case R_390_GOT20: + case R_390_GOTPLT20: + case R_390_TLS_GOTIE20: + checkInt(loc, val, 20, rel); + write32be(loc, (read32be(loc) & 0xF00000FF) | ((val & 0xFFF) << 16) | + ((val & 0xFF000) >> 4)); + break; + case R_390_PC24DBL: + case R_390_PLT24DBL: + checkInt(loc, val, 25, rel); + checkAlignment(loc, val, 2, rel); + loc[0] = val >> 17; + loc[1] = val >> 9; + loc[2] = val >> 1; + break; + case R_390_32: + case R_390_GOT32: + case R_390_GOTPLT32: + case R_390_GOTOFF: + case R_390_PLTOFF32: + case R_390_TLS_IE32: + case R_390_TLS_GOTIE32: + case R_390_TLS_GD32: + case R_390_TLS_LDM32: + case R_390_TLS_LDO32: + case R_390_TLS_LE32: + checkIntUInt(loc, val, 32, rel); + write32be(loc, val); + break; + case R_390_PC32: + case R_390_PLT32: + checkInt(loc, val, 32, rel); + write32be(loc, val); + break; + case R_390_PC32DBL: + case R_390_PLT32DBL: + case R_390_GOTPCDBL: + case R_390_GOTENT: + case R_390_GOTPLTENT: + case R_390_TLS_IEENT: + checkInt(loc, val, 33, rel); + checkAlignment(loc, val, 2, rel); + write32be(loc, val >> 1); + break; + case R_390_64: + case R_390_PC64: + case R_390_PLT64: + case R_390_GOT64: + case R_390_GOTPLT64: + case R_390_GOTOFF64: + case R_390_PLTOFF64: + case 
R_390_GOTPC: + case R_390_TLS_IE64: + case R_390_TLS_GOTIE64: + case R_390_TLS_GD64: + case R_390_TLS_LDM64: + case R_390_TLS_LDO64: + case R_390_TLS_LE64: + case R_390_TLS_DTPMOD: + case R_390_TLS_DTPOFF: + case R_390_TLS_TPOFF: + write64be(loc, val); + break; + case R_390_TLS_LOAD: + case R_390_TLS_GDCALL: + case R_390_TLS_LDCALL: + break; + default: + llvm_unreachable("unknown relocation"); + } +} + +TargetInfo *elf::getSystemZTargetInfo() { + static SystemZ t; + return &t; +} diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp index de459013595fed..a85bf3aa0c09d1 100644 --- a/lld/ELF/Arch/X86_64.cpp +++ b/lld/ELF/Arch/X86_64.cpp @@ -328,9 +328,10 @@ bool X86_64::relaxOnce(int pass) const { if (rel.expr != R_RELAX_GOT_PC) continue; - uint64_t v = sec->getRelocTargetVA( - sec->file, rel.type, rel.addend, - sec->getOutputSection()->addr + rel.offset, *rel.sym, rel.expr); + uint64_t v = sec->getRelocTargetVA(sec->file, rel.type, rel.addend, + sec->getOutputSection()->addr + + sec->outSecOff + rel.offset, + *rel.sym, rel.expr); if (isInt<32>(v)) continue; if (rel.sym->auxIdx == 0) { diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt index 475f7dea1dd7e9..83d816ddb0601e 100644 --- a/lld/ELF/CMakeLists.txt +++ b/lld/ELF/CMakeLists.txt @@ -33,6 +33,7 @@ add_lld_library(lldELF Arch/PPC64.cpp Arch/RISCV.cpp Arch/SPARCV9.cpp + Arch/SystemZ.cpp Arch/X86.cpp Arch/X86_64.cpp ARMErrataFix.cpp diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index f4b7d1c9d5b973..8b2c32b1534821 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -200,6 +200,7 @@ static std::tuple parseEmulation(StringRef emul) { .Case("msp430elf", {ELF32LEKind, EM_MSP430}) .Case("elf64_amdgpu", {ELF64LEKind, EM_AMDGPU}) .Case("elf64loongarch", {ELF64LEKind, EM_LOONGARCH}) + .Case("elf64_s390", {ELF64BEKind, EM_S390}) .Default({ELFNoneKind, EM_NONE}); if (ret.first == ELFNoneKind) @@ -1137,7 +1138,7 @@ static SmallVector getSymbolOrderingFile(MemoryBufferRef mb) { static bool 
getIsRela(opt::InputArgList &args) { // The psABI specifies the default relocation entry format. bool rela = is_contained({EM_AARCH64, EM_AMDGPU, EM_HEXAGON, EM_LOONGARCH, - EM_PPC, EM_PPC64, EM_RISCV, EM_X86_64}, + EM_PPC, EM_PPC64, EM_RISCV, EM_S390, EM_X86_64}, config->emachine); // If -z rel or -z rela is specified, use the last option. for (auto *arg : args.filtered(OPT_z)) { diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 75e5ee1d0da4f5..6c7ef27cbd4942 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -1614,6 +1614,8 @@ static uint16_t getBitcodeMachineKind(StringRef path, const Triple &t) { return EM_RISCV; case Triple::sparcv9: return EM_SPARCV9; + case Triple::systemz: + return EM_S390; case Triple::x86: return t.isOSIAMCU() ? EM_IAMCU : EM_386; case Triple::x86_64: @@ -1788,7 +1790,12 @@ void BinaryFile::parse() { } InputFile *elf::createInternalFile(StringRef name) { - return make(InputFile::InternalKind, MemoryBufferRef("", name)); + auto *file = + make(InputFile::InternalKind, MemoryBufferRef("", name)); + // References from an internal file do not lead to --warn-backrefs + // diagnostics. + file->groupId = 0; + return file; } ELFFileBase *elf::createObjFile(MemoryBufferRef mb, StringRef archiveName, diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index c728dd6c6306aa..e033a715b59214 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -354,9 +354,10 @@ InputSectionBase *InputSection::getRelocatedSection() const { template void InputSection::copyRelocations(uint8_t *buf) { - if (config->relax && !config->relocatable && config->emachine == EM_RISCV) { - // On RISC-V, relaxation might change relocations: copy from internal ones - // that are updated by relaxation. 
+ if (config->relax && !config->relocatable && + (config->emachine == EM_RISCV || config->emachine == EM_LOONGARCH)) { + // On LoongArch and RISC-V, relaxation might change relocations: copy + // from internal ones that are updated by relaxation. InputSectionBase *sec = getRelocatedSection(); copyRelocations(buf, llvm::make_range(sec->relocations.begin(), sec->relocations.end())); @@ -654,6 +655,7 @@ static int64_t getTlsTpOffset(const Symbol &s) { // Variant 2. case EM_HEXAGON: + case EM_S390: case EM_SPARCV9: case EM_386: case EM_X86_64: @@ -716,6 +718,10 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type, case R_GOT_PC: case R_RELAX_TLS_GD_TO_IE: return sym.getGotVA() + a - p; + case R_GOTPLT_GOTREL: + return sym.getGotPltVA() + a - in.got->getVA(); + case R_GOTPLT_PC: + return sym.getGotPltVA() + a - p; case R_LOONGARCH_GOT_PAGE_PC: if (sym.hasFlag(NEEDS_TLSGD)) return getLoongArchPageDelta(in.got->getGlobalDynAddr(sym) + a, p, type); @@ -807,6 +813,8 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type, return getLoongArchPageDelta(sym.getPltVA() + a, p, type); case R_PLT_GOTPLT: return sym.getPltVA() + a - in.gotPlt->getVA(); + case R_PLT_GOTREL: + return sym.getPltVA() + a - in.got->getVA(); case R_PPC32_PLTREL: // R_PPC_PLTREL24 uses the addend (usually 0 or 0x8000) to indicate r30 // stores _GLOBAL_OFFSET_TABLE_ or .got2+0x8000. The addend is ignored for @@ -961,12 +969,11 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef rels) { // vector. The computed value is st_value plus a non-negative offset. // Negative values are invalid, so -1 can be used as the tombstone value. // - // If the referenced symbol is discarded (made Undefined), or the - // section defining the referenced symbol is garbage collected, - // sym.getOutputSection() is nullptr. `ds->folded` catches the ICF folded - // case. 
However, resolving a relocation in .debug_line to -1 would stop - // debugger users from setting breakpoints on the folded-in function, so - // exclude .debug_line. + // If the referenced symbol is relative to a discarded section (due to + // --gc-sections, COMDAT, etc), it has been converted to an Undefined. + // `ds->folded` catches the ICF folded case. However, resolving a + // relocation in .debug_line to -1 would stop debugger users from setting + // breakpoints on the folded-in function, so exclude .debug_line. // // For pre-DWARF-v5 .debug_loc and .debug_ranges, -1 is a reserved value // (base address selection entry), use 1 (which is used by GNU ld for @@ -974,7 +981,7 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef rels) { // // TODO To reduce disruption, we use 0 instead of -1 as the tombstone // value. Enable -1 in a future release. - if (!sym.getOutputSection() || (ds && ds->folded && !isDebugLine)) { + if (!ds || (ds->folded && !isDebugLine)) { // If -z dead-reloc-in-nonalloc= is specified, respect it. uint64_t value = SignExtend64(*tombstone); // For a 32-bit local TU reference in .debug_names, X86_64::relocate diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index dda4242d8be1c1..243b28d90bb4c1 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -102,7 +102,23 @@ class SectionBase { link(link), info(info) {} }; -struct RISCVRelaxAux; +struct SymbolAnchor { + uint64_t offset; + Defined *d; + bool end; // true for the anchor of st_value+st_size +}; + +struct RelaxAux { + // This records symbol start and end offsets which will be adjusted according + // to the nearest relocDeltas element. + SmallVector anchors; + // For relocations[i], the actual offset is + // r_offset - (i ? relocDeltas[i-1] : 0). + std::unique_ptr relocDeltas; + // For relocations[i], the actual type is relocTypes[i]. + std::unique_ptr relocTypes; + SmallVector writes; +}; // This corresponds to a section of an input file.
class InputSectionBase : public SectionBase { @@ -226,9 +242,9 @@ class InputSectionBase : public SectionBase { // basic blocks. JumpInstrMod *jumpInstrMod = nullptr; - // Auxiliary information for RISC-V linker relaxation. RISC-V does not use - // jumpInstrMod. - RISCVRelaxAux *relaxAux; + // Auxiliary information for RISC-V and LoongArch linker relaxation. + // They do not use jumpInstrMod. + RelaxAux *relaxAux; // The compressed content size when `compressed` is true. size_t compressedSize; diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index b6a317bc3b6d69..619fbaf5dc5452 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -203,8 +203,9 @@ static bool isAbsoluteValue(const Symbol &sym) { // Returns true if Expr refers a PLT entry. static bool needsPlt(RelExpr expr) { - return oneof(expr); + return oneof(expr); } bool lld::elf::needsGot(RelExpr expr) { @@ -233,6 +234,8 @@ static RelExpr toPlt(RelExpr expr) { return R_PLT_PC; case R_ABS: return R_PLT; + case R_GOTREL: + return R_PLT_GOTREL; default: return expr; } @@ -253,6 +256,8 @@ static RelExpr fromPlt(RelExpr expr) { return R_ABS; case R_PLT_GOTPLT: return R_GOTPLTREL; + case R_PLT_GOTREL: + return R_GOTREL; default: return expr; } @@ -935,7 +940,7 @@ void elf::addGotEntry(Symbol &sym) { static void addTpOffsetGotEntry(Symbol &sym) { in.got->addEntry(sym); uint64_t off = sym.getGotOffset(); - if (!sym.isPreemptible && !config->isPic) { + if (!sym.isPreemptible && !config->shared) { in.got->addConstant({R_TPREL, target->symbolicRel, off, 0, &sym}); return; } @@ -979,10 +984,10 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type, if (oneof( - e)) + R_PLT_PC, R_PLT_GOTREL, R_PLT_GOTPLT, R_GOTPLT_GOTREL, R_GOTPLT_PC, + R_PPC32_PLTREL, R_PPC64_CALL_PLT, R_PPC64_RELAX_TOC, R_RISCV_ADD, + R_AARCH64_GOT_PAGE, R_LOONGARCH_PLT_PAGE_PC, R_LOONGARCH_GOT, + R_LOONGARCH_GOT_PAGE_PC>(e)) return true; // These never do, except if the entire file is position dependent 
or if @@ -1274,29 +1279,34 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, if (config->emachine == EM_MIPS) return handleMipsTlsRelocation(type, sym, c, offset, addend, expr); + bool isRISCV = config->emachine == EM_RISCV; if (oneof(expr) && config->shared) { + // R_RISCV_TLSDESC_{LOAD_LO12,ADD_LO12_I,CALL} reference a label. Do not + // set NEEDS_TLSDESC on the label. if (expr != R_TLSDESC_CALL) { - sym.setFlags(NEEDS_TLSDESC); + if (!isRISCV || type == R_RISCV_TLSDESC_HI20) + sym.setFlags(NEEDS_TLSDESC); c.addReloc({expr, type, offset, addend, &sym}); } return 1; } // ARM, Hexagon, LoongArch and RISC-V do not support GD/LD to IE/LE - // relaxation. + // optimizations. + // RISC-V supports TLSDESC to IE/LE optimizations. // For PPC64, if the file has missing R_PPC64_TLSGD/R_PPC64_TLSLD, disable - // relaxation as well. - bool toExecRelax = !config->shared && config->emachine != EM_ARM && - config->emachine != EM_HEXAGON && - config->emachine != EM_LOONGARCH && - config->emachine != EM_RISCV && - !c.file->ppc64DisableTLSRelax; + // optimization as well. + bool execOptimize = + !config->shared && config->emachine != EM_ARM && + config->emachine != EM_HEXAGON && config->emachine != EM_LOONGARCH && + !(isRISCV && expr != R_TLSDESC_PC && expr != R_TLSDESC_CALL) && + !c.file->ppc64DisableTLSRelax; // If we are producing an executable and the symbol is non-preemptable, it - // must be defined and the code sequence can be relaxed to use Local-Exec. + // must be defined and the code sequence can be optimized to use Local-Exec. // // ARM and RISC-V do not support any relaxations for TLS relocations, however, // we can omit the DTPMOD dynamic relocations and resolve them at link time @@ -1309,8 +1319,8 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, // module index, with a special value of 0 for the current module. GOT[e1] is // unused. There only needs to be one module index entry. 
if (oneof(expr)) { - // Local-Dynamic relocs can be relaxed to Local-Exec. - if (toExecRelax) { + // Local-Dynamic relocs can be optimized to Local-Exec. + if (execOptimize) { c.addReloc({target->adjustTlsExpr(type, R_RELAX_TLS_LD_TO_LE), type, offset, addend, &sym}); return target->getTlsGdRelaxSkip(type); @@ -1322,16 +1332,17 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, return 1; } - // Local-Dynamic relocs can be relaxed to Local-Exec. + // Local-Dynamic relocs can be optimized to Local-Exec. if (expr == R_DTPREL) { - if (toExecRelax) + if (execOptimize) expr = target->adjustTlsExpr(type, R_RELAX_TLS_LD_TO_LE); c.addReloc({expr, type, offset, addend, &sym}); return 1; } // Local-Dynamic sequence where offset of tls variable relative to dynamic - // thread pointer is stored in the got. This cannot be relaxed to Local-Exec. + // thread pointer is stored in the got. This cannot be optimized to + // Local-Exec. if (expr == R_TLSLD_GOT_OFF) { sym.setFlags(NEEDS_GOT_DTPREL); c.addReloc({expr, type, offset, addend, &sym}); @@ -1341,14 +1352,18 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, if (oneof(expr)) { - if (!toExecRelax) { + if (!execOptimize) { sym.setFlags(NEEDS_TLSGD); c.addReloc({expr, type, offset, addend, &sym}); return 1; } - // Global-Dynamic relocs can be relaxed to Initial-Exec or Local-Exec + // Global-Dynamic/TLSDESC can be optimized to Initial-Exec or Local-Exec // depending on the symbol being locally defined or not. + // + // R_RISCV_TLSDESC_{LOAD_LO12,ADD_LO12_I,CALL} reference a non-preemptible + // label, so the LE optimization will be categorized as + // R_RELAX_TLS_GD_TO_LE. We fix the categorization in RISCV::relocateAlloc. 
if (sym.isPreemptible) { sym.setFlags(NEEDS_TLSGD_TO_IE); c.addReloc({target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_IE), type, @@ -1363,9 +1378,9 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, if (oneof(expr)) { ctx.hasTlsIe.store(true, std::memory_order_relaxed); - // Initial-Exec relocs can be relaxed to Local-Exec if the symbol is locally - // defined. - if (toExecRelax && isLocalInExecutable) { + // Initial-Exec relocs can be optimized to Local-Exec if the symbol is + // locally defined. This is not supported on SystemZ. + if (execOptimize && isLocalInExecutable && config->emachine != EM_S390) { c.addReloc({R_RELAX_TLS_IE_TO_LE, type, offset, addend, &sym}); } else if (expr != R_TLSIE_HINT) { sym.setFlags(NEEDS_TLSIE); @@ -1463,7 +1478,7 @@ template void RelocationScanner::scanOne(RelTy *&i) { in.got->hasGotOffRel.store(true, std::memory_order_relaxed); } - // Process TLS relocations, including relaxing TLS relocations. Note that + // Process TLS relocations, including TLS optimizations. Note that // R_TPREL and R_TPREL_NEG relocations are resolved in processAux. if (sym.isTls()) { if (unsigned processed = @@ -1524,8 +1539,10 @@ void RelocationScanner::scan(ArrayRef rels) { // For EhInputSection, OffsetGetter expects the relocations to be sorted by // r_offset. In rare cases (.eh_frame pieces are reordered by a linker // script), the relocations may be unordered. + // On SystemZ, all sections need to be sorted by r_offset, to allow TLS + // relaxation to be handled correctly - see SystemZ::getTlsGdRelaxSkip. 
SmallVector storage; - if (isa(sec)) + if (isa(sec) || config->emachine == EM_S390) rels = sortRels(rels, storage); end = static_cast(rels.end()); diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h index cfb9092149f3e0..7eb8a811e6934f 100644 --- a/lld/ELF/Relocations.h +++ b/lld/ELF/Relocations.h @@ -40,11 +40,14 @@ enum RelExpr { R_GOTPLT, R_GOTPLTREL, R_GOTREL, + R_GOTPLT_GOTREL, + R_GOTPLT_PC, R_NONE, R_PC, R_PLT, R_PLT_PC, R_PLT_GOTPLT, + R_PLT_GOTREL, R_RELAX_HINT, R_RELAX_GOT_PC, R_RELAX_GOT_PC_NOPIC, diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index dd69916d6b05e8..f0ede1f43bbdb3 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -445,6 +445,7 @@ static std::pair parseBfdName(StringRef s) { .Case("elf32-msp430", {ELF32LEKind, EM_MSP430}) .Case("elf32-loongarch", {ELF32LEKind, EM_LOONGARCH}) .Case("elf64-loongarch", {ELF64LEKind, EM_LOONGARCH}) + .Case("elf64-s390", {ELF64BEKind, EM_S390}) .Default({ELFNoneKind, EM_NONE}); } diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 4b413163314b2e..bada394aa30d7d 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -1419,6 +1419,9 @@ DynamicSection::computeContents() { case EM_MIPS: addInSec(DT_MIPS_PLTGOT, *in.gotPlt); break; + case EM_S390: + addInSec(DT_PLTGOT, *in.got); + break; case EM_SPARCV9: addInSec(DT_PLTGOT, *in.plt); break; diff --git a/lld/ELF/Target.cpp b/lld/ELF/Target.cpp index 671d22cc66a0e9..b7922425a34e43 100644 --- a/lld/ELF/Target.cpp +++ b/lld/ELF/Target.cpp @@ -87,6 +87,8 @@ TargetInfo *elf::getTarget() { return getRISCVTargetInfo(); case EM_SPARCV9: return getSPARCV9TargetInfo(); + case EM_S390: + return getSystemZTargetInfo(); case EM_X86_64: return getX86_64TargetInfo(); } diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h index ab6b6b9c013ba3..0cefa318135662 100644 --- a/lld/ELF/Target.h +++ b/lld/ELF/Target.h @@ -95,6 +95,8 @@ class TargetInfo { // Do a linker relaxation pass and 
return true if we changed something. virtual bool relaxOnce(int pass) const { return false; } + // Do finalize relaxation after collecting relaxation infos. + virtual void finalizeRelax(int passes) const {} virtual void applyJumpInstrMod(uint8_t *loc, JumpModType type, JumpModType val) const {} @@ -186,6 +188,7 @@ TargetInfo *getPPC64TargetInfo(); TargetInfo *getPPCTargetInfo(); TargetInfo *getRISCVTargetInfo(); TargetInfo *getSPARCV9TargetInfo(); +TargetInfo *getSystemZTargetInfo(); TargetInfo *getX86TargetInfo(); TargetInfo *getX86_64TargetInfo(); template TargetInfo *getMipsTargetInfo(); @@ -236,6 +239,7 @@ void addArmSyntheticSectionMappingSymbol(Defined *); void sortArmMappingSymbols(); void convertArmInstructionstoBE8(InputSection *sec, uint8_t *buf); void createTaggedSymbols(const SmallVector &files); +void initSymbolAnchors(); LLVM_LIBRARY_VISIBILITY extern const TargetInfo *target; TargetInfo *getTarget(); diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 6f66f3615fa4a2..8a08b0fcc90dbc 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -261,6 +261,9 @@ static void demoteDefined(Defined &sym, DenseMap &map) { Undefined(sym.file, sym.getName(), binding, sym.stOther, sym.type, /*discardedSecIdx=*/map.lookup(sym.section)) .overwrite(sym); + // Eliminate from the symbol table, otherwise we would leave an undefined + // symbol if the symbol is unreferenced in the absence of GC. + sym.isUsedInRegularObj = false; } // If all references to a DSO happen to be weak, the DSO is not added to @@ -1518,12 +1521,12 @@ template void Writer::sortSections() { if (auto *osd = dyn_cast(cmd)) osd->osec.sortRank = getSectionRank(osd->osec); if (!script->hasSectionsCommand) { - // We know that all the OutputSections are contiguous in this case. 
- auto isSection = [](SectionCommand *cmd) { return isa(cmd); }; - std::stable_sort( - llvm::find_if(script->sectionCommands, isSection), - llvm::find_if(llvm::reverse(script->sectionCommands), isSection).base(), - compareSections); + // OutputDescs are mostly contiguous, but may be interleaved with + // SymbolAssignments in the presence of INSERT commands. + auto mid = std::stable_partition( + script->sectionCommands.begin(), script->sectionCommands.end(), + [](SectionCommand *cmd) { return isa(cmd); }); + std::stable_sort(script->sectionCommands.begin(), mid, compareSections); } // Process INSERT commands and update output section attributes. From this @@ -1752,8 +1755,8 @@ template void Writer::finalizeAddressDependentContent() { } } } - if (!config->relocatable && config->emachine == EM_RISCV) - riscvFinalizeRelax(pass); + if (!config->relocatable) + target->finalizeRelax(pass); if (config->relocatable) for (OutputSection *sec : outputSections) diff --git a/lld/MinGW/Driver.cpp b/lld/MinGW/Driver.cpp index 4752d92e3b1d71..7b16764dd2c7ce 100644 --- a/lld/MinGW/Driver.cpp +++ b/lld/MinGW/Driver.cpp @@ -448,6 +448,10 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, add("-lto-cs-profile-generate"); if (auto *arg = args.getLastArg(OPT_lto_cs_profile_file)) add("-lto-cs-profile-file:" + StringRef(arg->getValue())); + if (args.hasArg(OPT_plugin_opt_emit_llvm)) + add("-lldemit:llvm"); + if (args.hasArg(OPT_lto_emit_asm)) + add("-lldemit:asm"); if (auto *a = args.getLastArg(OPT_thinlto_cache_dir)) add("-lldltocache:" + StringRef(a->getValue())); diff --git a/lld/MinGW/Options.td b/lld/MinGW/Options.td index 02f00f27406c08..9a0a96aac7f1c6 100644 --- a/lld/MinGW/Options.td +++ b/lld/MinGW/Options.td @@ -158,6 +158,8 @@ def lto_cs_profile_generate: FF<"lto-cs-profile-generate">, HelpText<"Perform context sensitive PGO instrumentation">; def lto_cs_profile_file: JJ<"lto-cs-profile-file=">, HelpText<"Context sensitive profile file path">; +def lto_emit_asm: 
FF<"lto-emit-asm">, + HelpText<"Emit assembly code">; def thinlto_cache_dir: JJ<"thinlto-cache-dir=">, HelpText<"Path to ThinLTO cached object file directory">; @@ -181,6 +183,9 @@ def: J<"plugin-opt=cs-profile-path=">, Alias, HelpText<"Alias for --lto-cs-profile-file">; def plugin_opt_dwo_dir_eq: J<"plugin-opt=dwo_dir=">, HelpText<"Directory to store .dwo files when LTO and debug fission are used">; +def plugin_opt_emit_asm: F<"plugin-opt=emit-asm">, + Alias, HelpText<"Alias for --lto-emit-asm">; +def plugin_opt_emit_llvm: F<"plugin-opt=emit-llvm">; def: J<"plugin-opt=jobs=">, Alias, HelpText<"Alias for --thinlto-jobs=">; def plugin_opt_mcpu_eq: J<"plugin-opt=mcpu=">; diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 01669543cd50ca..6ada711a20a6da 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -29,8 +29,50 @@ ELF Improvements * ``--fat-lto-objects`` option is added to support LLVM FatLTO. Without ``--fat-lto-objects``, LLD will link LLVM FatLTO objects using the relocatable object file. (`D146778 `_) +* ``-Bsymbolic-non-weak`` is added to directly bind non-weak definitions. + (`D158322 `_) +* ``--lto-validate-all-vtables-have-type-infos``, which complements + ``--lto-whole-program-visibility``, is added to disable unsafe whole-program + devirtualization. ``--lto-known-safe-vtables=`` can be used + to mark known-safe vtable symbols. + (`D155659 `_) +* ``--save-temps --lto-emit-asm`` now derives ELF/asm file names from bitcode file names. + ``ld.lld --save-temps a.o d/b.o -o out`` will create ELF relocatable files + ``out.lto.a.o``/``d/out.lto.b.o`` instead of ``out1.lto.o``/``out2.lto.o``. + (`#78835 `_) +* ``--no-allow-shlib-undefined`` now reports errors for DSO referencing + non-exported definitions. + (`#70769 `_) * common-page-size can now be larger than the system page-size. 
(`#57618 `_) +* When call graph profile information is available due to instrumentation or + sample PGO, input sections are now sorted using the new ``cdsort`` algorithm, + better than the previous ``hfsort`` algorithm. + (`D152840 `_) +* Symbol assignments like ``a = DEFINED(a) ? a : 0;`` are now handled. + (`#65866 `_) +* ``OVERLAY`` now supports optional start address and LMA + (`#77272 `_) +* Relocations referencing a symbol defined in ``/DISCARD/`` section now lead to + an error. + (`#69295 `_) +* For AArch64 MTE, global variable descriptors have been implemented. + (`D152921 `_) +* ``R_AARCH64_GOTPCREL32`` is now supported. + (`#72584 `_) +* ``R_LARCH_PCREL20_S2``/``R_LARCH_ADD6``/``R_LARCH_CALL36`` and extreme code + model relocations are now supported. +* ``--emit-relocs`` is now supported for RISC-V linker relaxation. + (`D159082 `_) +* Call relaxation respects RVC when mixing +c and -c relocatable files. + (`#73977 `_) +* ``R_RISCV_GOT32_PCREL`` is now supported. + (`#72587 `_) +* ``R_RISCV_SET_ULEB128``/``R_RISCV_SUB_ULEB128`` relocations are now supported. + (`#72610 `_) + (`#77261 `_) +* RISC-V TLSDESC is now supported. + (`#79239 `_) Breaking changes ---------------- @@ -40,10 +82,77 @@ COFF Improvements * Added support for ``--time-trace`` and associated ``--time-trace-granularity``. This generates a .json profile trace of the linker execution. + (`#68236 `_) + +* The ``-dependentloadflag`` option was implemented. + (`#71537 `_) + +* LLD now prefers library paths specified with ``-libpath:`` over the implicitly + detected toolchain paths. + (`#78039 `_) + +* Added new options ``-lldemit:llvm`` and ``-lldemit:asm`` for getting + the output of LTO compilation as LLVM bitcode or assembly. + (`#66964 `_) + (`#67079 `_) + +* Added a new option ``-build-id`` for generating a ``.buildid`` section + when not generating a PDB. A new symbol ``__buildid`` is generated by + the linker, allowing code to reference the build ID of the binary. 
+ (`#71433 `_) + (`#74652 `_) + +* A new, LLD specific option, ``-lld-allow-duplicate-weak``, was added + for allowing duplicate weak symbols. + (`#68077 `_) + +* More correctly handle LTO of files that define ``__imp_`` prefixed dllimport + redirections. + (`#70777 `_) + (`#71376 `_) + (`#72989 `_) + +* Linking undefined references to weak symbols with LTO now works. + (`#70430 `_) + +* Use the ``SOURCE_DATE_EPOCH`` environment variable for the PE header and + debug directory timestamps, if neither the ``/Brepro`` nor ``/timestamp:`` + options have been specified. This makes the linker output reproducible by + setting this environment variable. + (`#81326 `_) + +* Lots of incremental work towards supporting linking ARM64EC binaries. MinGW Improvements ------------------ +* Added support for many LTO and ThinLTO options (most LTO options supported + by the ELF driver, that are implemented by the COFF backend as well, + should be supported now). + (`D158412 `_) + (`D158887 `_) + (`#77387 `_) + (`#81475 `_) + +* LLD no longer tries to autodetect and use library paths from MSVC/WinSDK + installations when run in MinGW mode; that mode of operation shouldn't + ever be needed in MinGW mode, and could be a source of unexpected + behaviours. + (`D144084 `_) + +* The ``--icf=safe`` option now works as expected; it was previously a no-op. + (`#70037 `_) + +* The strip flags ``-S`` and ``-s`` now can be used to strip out DWARF debug + info and symbol tables while emitting a PDB debug info file. + (`#75181 `_) + +* The option ``--dll`` is handled as an alias for the ``--shared`` option. + (`#68575 `_) + +* The option ``--sort-common`` is ignored now. + (`#66336 `_) + MachO Improvements ------------------ @@ -54,5 +163,10 @@ WebAssembly Improvements is read from object files within the archive. This matches the behaviour of the ELF linker. +SystemZ +------- + +* Add target support for SystemZ (s390x). 
+ Fixes ##### diff --git a/lld/test/COFF/autoimport-gc.s b/lld/test/COFF/autoimport-gc.s new file mode 100644 index 00000000000000..fef6c02eba82f9 --- /dev/null +++ b/lld/test/COFF/autoimport-gc.s @@ -0,0 +1,41 @@ +# REQUIRES: x86 +# RUN: split-file %s %t.dir + +# RUN: llvm-mc -triple=x86_64-windows-gnu %t.dir/lib.s -filetype=obj -o %t.dir/lib.obj +# RUN: lld-link -out:%t.dir/lib.dll -dll -entry:DllMainCRTStartup %t.dir/lib.obj -lldmingw -implib:%t.dir/lib.lib + +# RUN: llvm-mc -triple=x86_64-windows-gnu %t.dir/main.s -filetype=obj -o %t.dir/main.obj +# RUN: lld-link -lldmingw -out:%t.dir/main.exe -entry:main %t.dir/main.obj %t.dir/lib.lib -opt:ref -debug:dwarf + +#--- main.s + .global main + .section .text$main,"xr",one_only,main +main: + ret + + .global other + .section .text$other,"xr",one_only,other +other: + movq .refptr.variable(%rip), %rax + movl (%rax), %eax + ret + + .section .rdata$.refptr.variable,"dr",discard,.refptr.variable + .global .refptr.variable +.refptr.variable: + .quad variable + + .section .debug_info + .long 1 + .quad variable + .long 2 + +#--- lib.s + .global variable + .global DllMainCRTStartup + .text +DllMainCRTStartup: + ret + .data +variable: + .long 42 diff --git a/lld/test/COFF/def-export-cpp.s b/lld/test/COFF/def-export-cpp.s index e00b35b1c5b39b..370b8ddba4104b 100644 --- a/lld/test/COFF/def-export-cpp.s +++ b/lld/test/COFF/def-export-cpp.s @@ -10,6 +10,7 @@ # IMPLIB: File: foo.dll # IMPLIB: Name type: undecorate +# IMPLIB-NEXT: Export name: GetPathOnDisk # IMPLIB-NEXT: Symbol: __imp_?GetPathOnDisk@@YA_NPEA_W@Z # IMPLIB-NEXT: Symbol: ?GetPathOnDisk@@YA_NPEA_W@Z diff --git a/lld/test/COFF/def-export-stdcall.s b/lld/test/COFF/def-export-stdcall.s index f015e205c74a33..7e4e04c77cbe7a 100644 --- a/lld/test/COFF/def-export-stdcall.s +++ b/lld/test/COFF/def-export-stdcall.s @@ -6,15 +6,19 @@ # RUN: llvm-readobj --coff-exports %t.dll | FileCheck -check-prefix UNDECORATED-EXPORTS %s # UNDECORATED-IMPLIB: Name type: noprefix +# 
UNDECORATED-IMPLIB-NEXT: Export name: _underscored # UNDECORATED-IMPLIB-NEXT: __imp___underscored # UNDECORATED-IMPLIB-NEXT: __underscored # UNDECORATED-IMPLIB: Name type: undecorate +# UNDECORATED-IMPLIB-NEXT: Export name: fastcall # UNDECORATED-IMPLIB-NEXT: __imp_@fastcall@8 # UNDECORATED-IMPLIB-NEXT: fastcall@8 # UNDECORATED-IMPLIB: Name type: undecorate +# UNDECORATED-IMPLIB-NEXT: Export name: stdcall # UNDECORATED-IMPLIB-NEXT: __imp__stdcall@8 # UNDECORATED-IMPLIB-NEXT: _stdcall@8 # UNDECORATED-IMPLIB: Name type: undecorate +# UNDECORATED-IMPLIB-NEXT: Export name: vectorcall # UNDECORATED-IMPLIB-NEXT: __imp_vectorcall@@8 # UNDECORATED-IMPLIB-NEXT: vectorcall@@8 @@ -30,12 +34,15 @@ # RUN: llvm-readobj --coff-exports %t.dll | FileCheck -check-prefix DECORATED-EXPORTS %s # DECORATED-IMPLIB: Name type: name +# DECORATED-IMPLIB-NEXT: Export name: @fastcall@8 # DECORATED-IMPLIB-NEXT: __imp_@fastcall@8 # DECORATED-IMPLIB-NEXT: @fastcall@8 # DECORATED-IMPLIB: Name type: name +# DECORATED-IMPLIB-NEXT: Export name: _stdcall@8 # DECORATED-IMPLIB-NEXT: __imp__stdcall@8 # DECORATED-IMPLIB-NEXT: _stdcall@8 # DECORATED-IMPLIB: Name type: name +# DECORATED-IMPLIB-NEXT: Export name: vectorcall@@8 # DECORATED-IMPLIB-NEXT: __imp_vectorcall@@8 # DECORATED-IMPLIB-NEXT: vectorcall@@8 @@ -51,14 +58,17 @@ # RUN: llvm-readobj --coff-exports %t.dll | FileCheck -check-prefix DECORATED-MINGW-EXPORTS %s # DECORATED-MINGW-IMPLIB: Name type: name +# DECORATED-MINGW-IMPLIB-NEXT: Export name: @fastcall@8 # DECORATED-MINGW-IMPLIB-NEXT: __imp_@fastcall@8 # DECORATED-MINGW-IMPLIB-NEXT: fastcall@8 # DECORATED-MINGW-IMPLIB: Name type: noprefix +# DECORATED-MINGW-IMPLIB-NEXT: Export name: stdcall@8 # DECORATED-MINGW-IMPLIB-NEXT: __imp__stdcall@8 # DECORATED-MINGW-IMPLIB-NEXT: _stdcall@8 # GNU tools don't support vectorcall, but this test is just to track that # lld's behaviour remains consistent over time. 
# DECORATED-MINGW-IMPLIB: Name type: name +# DECORATED-MINGW-IMPLIB-NEXT: Export name: vectorcall@@8 # DECORATED-MINGW-IMPLIB-NEXT: __imp_vectorcall@@8 # DECORATED-MINGW-IMPLIB-NEXT: vectorcall@@8 @@ -75,14 +85,17 @@ # RUN: llvm-readobj --coff-exports %t.dll | FileCheck -check-prefix MINGW-KILL-AT-EXPORTS %s # MINGW-KILL-AT-IMPLIB: Name type: noprefix +# MINGW-KILL-AT-IMPLIB: Export name: fastcall # MINGW-KILL-AT-IMPLIB: __imp__fastcall # MINGW-KILL-AT-IMPLIB-NEXT: _fastcall # MINGW-KILL-AT-IMPLIB: Name type: noprefix +# MINGW-KILL-AT-IMPLIB-NEXT: Export name: stdcall # MINGW-KILL-AT-IMPLIB-NEXT: __imp__stdcall # MINGW-KILL-AT-IMPLIB-NEXT: _stdcall # GNU tools don't support vectorcall, but this test is just to track that # lld's behaviour remains consistent over time. # MINGW-KILL-AT-IMPLIB: Name type: noprefix +# MINGW-KILL-AT-IMPLIB-NEXT: Export name: vectorcall # MINGW-KILL-AT-IMPLIB-NEXT: __imp__vectorcall # MINGW-KILL-AT-IMPLIB-NEXT: _vectorcall diff --git a/lld/test/COFF/delayimports-armnt.yaml b/lld/test/COFF/delayimports-armnt.yaml index 7d9bc38c5c3606..ea96d864ef53d5 100644 --- a/lld/test/COFF/delayimports-armnt.yaml +++ b/lld/test/COFF/delayimports-armnt.yaml @@ -6,6 +6,7 @@ # RUN: llvm-readobj --coff-imports %t.exe | FileCheck -check-prefix=IMPORT %s # RUN: llvm-readobj --coff-basereloc %t.exe | FileCheck -check-prefix=BASEREL %s # RUN: llvm-objdump --no-print-imm-hex -d %t.exe | FileCheck --check-prefix=DISASM %s +# RUN: llvm-readobj --file-headers %t.exe | FileCheck -check-prefix=DIR %s # IMPORT: Format: COFF-ARM # IMPORT-NEXT: Arch: thumb @@ -13,9 +14,9 @@ # IMPORT-NEXT: DelayImport { # IMPORT-NEXT: Name: library.dll # IMPORT-NEXT: Attributes: 0x1 -# IMPORT-NEXT: ModuleHandle: 0x3000 -# IMPORT-NEXT: ImportAddressTable: 0x3008 -# IMPORT-NEXT: ImportNameTable: 0x2040 +# IMPORT-NEXT: ModuleHandle: 0x3008 +# IMPORT-NEXT: ImportAddressTable: 0x3010 +# IMPORT-NEXT: ImportNameTable: 0x2044 # IMPORT-NEXT: BoundDelayImportTable: 0x0 # IMPORT-NEXT: 
UnloadDelayImportTable: 0x0 # IMPORT-NEXT: Import { @@ -43,7 +44,7 @@ # BASEREL-NEXT: } # BASEREL-NEXT: Entry { # BASEREL-NEXT: Type: HIGHLOW -# BASEREL-NEXT: Address: 0x3008 +# BASEREL-NEXT: Address: 0x3010 # BASEREL-NEXT: } # BASEREL-NEXT: Entry { # BASEREL-NEXT: Type: ABSOLUTE @@ -52,20 +53,24 @@ # BASEREL-NEXT: ] # # DISASM: 00401000 <.text>: -# DISASM: 40100c: f243 0c08 movw r12, #12296 +# DISASM: 40100c: f243 0c10 movw r12, #12304 # DISASM-NEXT: f2c0 0c40 movt r12, #64 # DISASM-NEXT: f000 b800 b.w {{.+}} @ imm = #0 # DISASM-NEXT: e92d 480f push.w {r0, r1, r2, r3, r11, lr} # DISASM-NEXT: f20d 0b10 addw r11, sp, #16 # DISASM-NEXT: ed2d 0b10 vpush {d0, d1, d2, d3, d4, d5, d6, d7} # DISASM-NEXT: 4661 mov r1, r12 -# DISASM-NEXT: f242 0000 movw r0, #8192 +# DISASM-NEXT: f242 0004 movw r0, #8196 # DISASM-NEXT: f2c0 0040 movt r0, #64 # DISASM-NEXT: f7ff ffe7 bl 0x401000 <.text> # DISASM-NEXT: 4684 mov r12, r0 # DISASM-NEXT: ecbd 0b10 vpop {d0, d1, d2, d3, d4, d5, d6, d7} # DISASM-NEXT: e8bd 480f pop.w {r0, r1, r2, r3, r11, lr} # DISASM-NEXT: 4760 bx r12 +# +# DIR: DelayImportDescriptorRVA: 0x2004 +# DIR-NEXT: DelayImportDescriptorSize: 0x40 + --- !COFF header: @@ -80,6 +85,14 @@ sections: - VirtualAddress: 0 SymbolName: __imp_function Type: IMAGE_REL_ARM_MOV32T + - Name: .rdata + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ] + Alignment: 1 + SectionData: 01 + - Name: .data + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ] + Alignment: 1 + SectionData: 02 symbols: - Name: .text Value: 0 diff --git a/lld/test/COFF/dllexport.s b/lld/test/COFF/dllexport.s index a238b70ce1b4f6..b04ebc3a33c3e2 100644 --- a/lld/test/COFF/dllexport.s +++ b/lld/test/COFF/dllexport.s @@ -6,15 +6,19 @@ # RUN: llvm-readobj --coff-exports %t.dll | FileCheck -check-prefix DECORATED-EXPORTS %s # DECORATED-IMPLIB: Name type: name +# DECORATED-IMPLIB-NEXT: Export name: @fastcall@8 # DECORATED-IMPLIB-NEXT: __imp_@fastcall@8 # 
DECORATED-IMPLIB-NEXT: @fastcall@8 # DECORATED-IMPLIB: Name type: name +# DECORATED-IMPLIB-NEXT: Export name: _stdcall@8 # DECORATED-IMPLIB-NEXT: __imp__stdcall@8 # DECORATED-IMPLIB-NEXT: _stdcall@8 # DECORATED-IMPLIB: Name type: noprefix +# DECORATED-IMPLIB-NEXT: Export name: _underscored # DECORATED-IMPLIB-NEXT: __imp___underscored # DECORATED-IMPLIB-NEXT: __underscored # DECORATED-IMPLIB: Name type: name +# DECORATED-IMPLIB-NEXT: Export name: vectorcall@@8 # DECORATED-IMPLIB-NEXT: __imp_vectorcall@@8 # DECORATED-IMPLIB-NEXT: vectorcall@@8 diff --git a/lld/test/COFF/lto-cache-errors.ll b/lld/test/COFF/lto-cache-errors.ll index 55244e5690dc34..a46190a81b6230 100644 --- a/lld/test/COFF/lto-cache-errors.ll +++ b/lld/test/COFF/lto-cache-errors.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ;; Not supported on windows since we use permissions to deny the creation ; UNSUPPORTED: system-windows diff --git a/lld/test/COFF/thinlto-emit-imports.ll b/lld/test/COFF/thinlto-emit-imports.ll index a9f22c1dc2dcff..b47a6cea4eb7df 100644 --- a/lld/test/COFF/thinlto-emit-imports.ll +++ b/lld/test/COFF/thinlto-emit-imports.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ; Generate summary sections and test lld handling. 
; RUN: opt -module-summary %s -o %t1.obj diff --git a/lld/test/COFF/timestamp.test b/lld/test/COFF/timestamp.test index fbdc5788a33a55..cc73af13c38ca6 100644 --- a/lld/test/COFF/timestamp.test +++ b/lld/test/COFF/timestamp.test @@ -3,9 +3,28 @@ RUN: yaml2obj %p/Inputs/generic.yaml -o %t.obj RUN: lld-link %t.obj /debug /Brepro /entry:main /nodefaultlib /out:%t.1.exe RUN: lld-link %t.obj /debug /Brepro /entry:main /nodefaultlib /out:%t.2.exe RUN: lld-link %t.obj /debug /timestamp:0 /entry:main /nodefaultlib /out:%t.3.exe +RUN: env SOURCE_DATE_EPOCH=0 lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.4.exe +# Test timestamps corresponding to INT32_TMAX +RUN: lld-link %t.obj /debug /timestamp:2147483647 /entry:main /nodefaultlib /out:%t.5.exe +RUN: env SOURCE_DATE_EPOCH=2147483647 lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.6.exe +# Test that the command line option /timestamp has precedence over SOURCE_DATE_EPOCH +RUN: env SOURCE_DATE_EPOCH=12345 lld-link %t.obj /debug /timestamp:0 /entry:main /nodefaultlib /out:%t.7.exe +# Test timestamps corresponding to UINT32_TMAX +RUN: lld-link %t.obj /debug /timestamp:4294967295 /entry:main /nodefaultlib /out:%t.8.exe +RUN: env SOURCE_DATE_EPOCH=4294967295 lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.9.exe +# Test that setting UINT32_MAX+1 as timestamp fails. 
+RUN: env LLD_IN_TEST=1 not lld-link %t.obj /debug /timestamp:4294967296 /entry:main /nodefaultlib /out:%t.10.exe 2>&1 | FileCheck %s --check-prefix=ERROR +RUN: env SOURCE_DATE_EPOCH=4294967296 env LLD_IN_TEST=1 not lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.11.exe 2>&1 | FileCheck %s --check-prefix=ERROR2 RUN: llvm-readobj --file-headers --coff-debug-directory %t.1.exe | FileCheck %s --check-prefix=HASH RUN: llvm-readobj --file-headers --coff-debug-directory %t.2.exe | FileCheck %s --check-prefix=HASH RUN: llvm-readobj --file-headers --coff-debug-directory %t.3.exe | FileCheck %s --check-prefix=ZERO +RUN: llvm-readobj --file-headers --coff-debug-directory %t.4.exe | FileCheck %s --check-prefix=ZERO +RUN: llvm-readobj --file-headers --coff-debug-directory %t.5.exe | FileCheck %s --check-prefix=LARGE +RUN: llvm-readobj --file-headers --coff-debug-directory %t.6.exe | FileCheck %s --check-prefix=LARGE +RUN: llvm-readobj --file-headers --coff-debug-directory %t.7.exe | FileCheck %s --check-prefix=ZERO + +# Not inspecting %t.8.exe and %t.9.exe; llvm-readobj with a 32 bit time_t fails to print dates +# past INT32_MAX correctly. HASH: ImageFileHeader { HASH: TimeDateStamp: [[STAMP:.*]] @@ -16,3 +35,11 @@ ZERO: ImageFileHeader { ZERO: TimeDateStamp: 1970-01-01 00:00:00 (0x0) ZERO: DebugDirectory [ ZERO: TimeDateStamp: 1970-01-01 00:00:00 (0x0) + +LARGE: ImageFileHeader { +LARGE: TimeDateStamp: 2038-01-19 03:14:07 (0x7FFFFFFF) +LARGE: DebugDirectory [ +LARGE: TimeDateStamp: 2038-01-19 03:14:07 (0x7FFFFFFF) + +ERROR: error: invalid timestamp: 4294967296. Expected 32-bit integer +ERROR2: error: invalid SOURCE_DATE_EPOCH timestamp: 4294967296. Expected 32-bit integer diff --git a/lld/test/ELF/Inputs/systemz-init.s b/lld/test/ELF/Inputs/systemz-init.s new file mode 100644 index 00000000000000..1611b69b4419e3 --- /dev/null +++ b/lld/test/ELF/Inputs/systemz-init.s @@ -0,0 +1,5 @@ +// glibc < 2.39 used to align .init and .fini code at a 4-byte boundary. 
+// This file aims to recreate that behavior. + .section .init,"ax",@progbits + .align 4 + lg %r4, 272(%r15) diff --git a/lld/test/ELF/basic-systemz.s b/lld/test/ELF/basic-systemz.s new file mode 100644 index 00000000000000..f7bb0e8cbd020d --- /dev/null +++ b/lld/test/ELF/basic-systemz.s @@ -0,0 +1,63 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld --hash-style=sysv -discard-all -shared %t.o -o %t.so +# RUN: llvm-readelf --file-header --program-headers --section-headers --dynamic-table %t.so | FileCheck %s + +# Exits with return code 55 on linux. +.text + lghi 2,55 + svc 1 + +# CHECK: ELF Header: +# CHECK-NEXT: Magic: 7f 45 4c 46 02 02 01 00 00 00 00 00 00 00 00 00 +# CHECK-NEXT: Class: ELF64 +# CHECK-NEXT: Data: 2's complement, big endian +# CHECK-NEXT: Version: 1 (current) +# CHECK-NEXT: OS/ABI: UNIX - System V +# CHECK-NEXT: ABI Version: 0 +# CHECK-NEXT: Type: DYN (Shared object file) +# CHECK-NEXT: Machine: IBM S/390 +# CHECK-NEXT: Version: 0x1 +# CHECK-NEXT: Entry point address: 0x0 +# CHECK-NEXT: Start of program headers: 64 (bytes into file) +# CHECK-NEXT: Start of section headers: 768 (bytes into file) +# CHECK-NEXT: Flags: 0x0 +# CHECK-NEXT: Size of this header: 64 (bytes) +# CHECK-NEXT: Size of program headers: 56 (bytes) +# CHECK-NEXT: Number of program headers: 7 +# CHECK-NEXT: Size of section headers: 64 (bytes) +# CHECK-NEXT: Number of section headers: 11 +# CHECK-NEXT: Section header string table index: 9 + +# CHECK: Section Headers: +# CHECK-NEXT: [Nr] Name Type Address Off Size ES Flg Lk Inf Al +# CHECK-NEXT: [ 0] NULL 0000000000000000 000000 000000 00 0 0 0 +# CHECK-NEXT: [ 1] .dynsym DYNSYM 00000000000001c8 0001c8 000018 18 A 3 1 8 +# CHECK-NEXT: [ 2] .hash HASH 00000000000001e0 0001e0 000010 04 A 1 0 4 +# CHECK-NEXT: [ 3] .dynstr STRTAB 00000000000001f0 0001f0 000001 00 A 0 0 1 +# CHECK-NEXT: [ 4] .text PROGBITS 00000000000011f4 0001f4 000006 00 AX 0 0 4 +# CHECK-NEXT: [ 5] .dynamic 
DYNAMIC 0000000000002200 000200 000060 10 WA 3 0 8 +# CHECK-NEXT: [ 6] .relro_padding NOBITS 0000000000002260 000260 000da0 00 WA 0 0 1 +# CHECK-NEXT: [ 7] .comment PROGBITS 0000000000000000 000260 000008 01 MS 0 0 1 +# CHECK-NEXT: [ 8] .symtab SYMTAB 0000000000000000 000268 000030 18 10 2 8 +# CHECK-NEXT: [ 9] .shstrtab STRTAB 0000000000000000 000298 000058 00 0 0 1 +# CHECK-NEXT: [10] .strtab STRTAB 0000000000000000 0002f0 00000a 00 0 0 1 + +# CHECK: Program Headers: +# CHECK-NEXT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# CHECK-NEXT: PHDR 0x000040 0x0000000000000040 0x0000000000000040 0x000188 0x000188 R 0x8 +# CHECK-NEXT: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x0001f1 0x0001f1 R 0x1000 +# CHECK-NEXT: LOAD 0x0001f4 0x00000000000011f4 0x00000000000011f4 0x000006 0x000006 R E 0x1000 +# CHECK-NEXT: LOAD 0x000200 0x0000000000002200 0x0000000000002200 0x000060 0x000e00 RW 0x1000 +# CHECK-NEXT: DYNAMIC 0x000200 0x0000000000002200 0x0000000000002200 0x000060 0x000060 RW 0x8 +# CHECK-NEXT: GNU_RELRO 0x000200 0x0000000000002200 0x0000000000002200 0x000060 0x000e00 R 0x1 +# CHECK-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0x0 + +# CHECK: Dynamic section at offset 0x200 contains 6 entries: +# CHECK-NEXT: Tag Type Name/Value +# CHECK-NEXT: 0x0000000000000006 (SYMTAB) 0x1c8 +# CHECK-NEXT: 0x000000000000000b (SYMENT) 24 (bytes) +# CHECK-NEXT: 0x0000000000000005 (STRTAB) 0x1f0 +# CHECK-NEXT: 0x000000000000000a (STRSZ) 1 (bytes) +# CHECK-NEXT: 0x0000000000000004 (HASH) 0x1e0 +# CHECK-NEXT: 0x0000000000000000 (NULL) 0x0 diff --git a/lld/test/ELF/dead-reloc-in-nonalloc.s b/lld/test/ELF/dead-reloc-in-nonalloc.s index 145604eb883a9a..b675fc50fc2ea2 100644 --- a/lld/test/ELF/dead-reloc-in-nonalloc.s +++ b/lld/test/ELF/dead-reloc-in-nonalloc.s @@ -17,7 +17,7 @@ # AA: Contents of section .debug_info: # AA-NEXT: 0000 [[ADDR]] 00000000 aaaaaaaa 00000000 # AA: Contents of section .not_debug: -# AA-NEXT: 0000 bbbbbbbb 
bbbbbbbb 00000000 . +# AA-NEXT: 0000 bbbbbbbb 2a000000 00000000 . ## Specifying zero can get a behavior similar to GNU ld. # RUN: ld.lld --icf=all -z dead-reloc-in-nonalloc=.debug_info=0 %t.o %tabs.o -o %tzero diff --git a/lld/test/ELF/emulation-systemz.s b/lld/test/ELF/emulation-systemz.s new file mode 100644 index 00000000000000..dfdb4620954c8a --- /dev/null +++ b/lld/test/ELF/emulation-systemz.s @@ -0,0 +1,29 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld -m elf64_s390 %t.o -o %t1 +# RUN: llvm-readelf --file-header %t1 | FileCheck %s +# RUN: ld.lld %t.o -o %t2 +# RUN: llvm-readelf --file-header %t2 | FileCheck %s +# RUN: echo 'OUTPUT_FORMAT(elf64-s390)' > %t.script +# RUN: ld.lld %t.script %t.o -o %t3 +# RUN: llvm-readelf --file-header %t3 | FileCheck %s + +# CHECK: ELF Header: +# CHECK-NEXT: Magic: 7f 45 4c 46 02 02 01 00 00 00 00 00 00 00 00 00 +# CHECK-NEXT: Class: ELF64 +# CHECK-NEXT: Data: 2's complement, big endian +# CHECK-NEXT: Version: 1 (current) +# CHECK-NEXT: OS/ABI: UNIX - System V +# CHECK-NEXT: ABI Version: 0 +# CHECK-NEXT: Type: EXEC (Executable file) +# CHECK-NEXT: Machine: IBM S/390 +# CHECK-NEXT: Version: 0x1 +# CHECK-NEXT: Entry point address: +# CHECK-NEXT: Start of program headers: 64 (bytes into file) +# CHECK-NEXT: Start of section headers: +# CHECK-NEXT: Flags: 0x0 +# CHECK-NEXT: Size of this header: 64 (bytes) +# CHECK-NEXT: Size of program headers: 56 (bytes) + +.globl _start +_start: diff --git a/lld/test/ELF/linkerscript/discard-section.s b/lld/test/ELF/linkerscript/discard-section.s index 24f3b2b73e991f..0bbebac59bb345 100644 --- a/lld/test/ELF/linkerscript/discard-section.s +++ b/lld/test/ELF/linkerscript/discard-section.s @@ -9,6 +9,9 @@ # RUN: ld.lld -r -T a.lds a.o b.o -o a.ro 2>&1 | FileCheck %s --check-prefix=WARNING --implicit-check-not=warning: # RUN: llvm-readelf -r -s a.ro | FileCheck %s --check-prefix=RELOC +# RUN: ld.lld -r --gc-sections -T a.lds a.o b.o -o 
a.gc.ro --no-fatal-warnings +# RUN: llvm-readelf -r -s a.gc.ro | FileCheck %s --check-prefix=RELOC-GC + # LOCAL: error: relocation refers to a discarded section: .aaa # LOCAL-NEXT: >>> defined in a.o # LOCAL-NEXT: >>> referenced by a.o:(.bbb+0x0) @@ -32,16 +35,18 @@ # WARNING: warning: relocation refers to a discarded section: .aaa # WARNING-NEXT: >>> referenced by a.o:(.rela.bbb+0x0) +## GNU ld reports "defined in discarded secion" errors even in -r mode. +## We set the symbol index to 0. # RELOC: Relocation section '.rela.bbb' at offset {{.*}} contains 1 entries: # RELOC-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend # RELOC-NEXT: 0000000000000000 0000000000000000 R_X86_64_NONE 0 # RELOC-EMPTY: # RELOC-NEXT: Relocation section '.rela.data' at offset {{.*}} contains 4 entries: # RELOC-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend -# RELOC-NEXT: 0000000000000000 0000000500000001 R_X86_64_64 0000000000000000 global + 0 -# RELOC-NEXT: 0000000000000008 0000000700000001 R_X86_64_64 0000000000000000 weak + 0 -# RELOC-NEXT: 0000000000000010 0000000600000001 R_X86_64_64 0000000000000000 weakref1 + 0 -# RELOC-NEXT: 0000000000000018 0000000800000001 R_X86_64_64 0000000000000000 weakref2 + 0 +# RELOC-NEXT: 0000000000000000 0000000000000001 R_X86_64_64 0 +# RELOC-NEXT: 0000000000000008 0000000000000001 R_X86_64_64 0 +# RELOC-NEXT: 0000000000000010 0000000000000001 R_X86_64_64 0 +# RELOC-NEXT: 0000000000000018 0000000000000001 R_X86_64_64 0 # RELOC: Num: Value Size Type Bind Vis Ndx Name # RELOC-NEXT: 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND @@ -49,23 +54,25 @@ # RELOC-NEXT: 2: 0000000000000000 0 SECTION LOCAL DEFAULT 2 .bbb # RELOC-NEXT: 3: 0000000000000000 0 SECTION LOCAL DEFAULT 4 .data # RELOC-NEXT: 4: 0000000000000000 0 NOTYPE GLOBAL DEFAULT 1 _start -# RELOC-NEXT: 5: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND global -# RELOC-NEXT: 6: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND weakref1 -# RELOC-NEXT: 7: 0000000000000000 0 NOTYPE 
GLOBAL DEFAULT UND weak -# RELOC-NEXT: 8: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND weakref2 # RELOC-EMPTY: +# RELOC-GC: There are no relocations in this file. + #--- a.s .globl _start _start: .section .aaa,"a" -.globl global, weakref1 +.globl global, weakref1, unused .weak weak, weakref2 global: weak: weakref1: weakref2: +## Eliminate `unused` just like GC discarded definitions. +## Linux kernel's CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y configuration expects +## that the unreferenced `unused` is not emitted to .symtab. +unused: .quad 0 .section .bbb,"aw" diff --git a/lld/test/ELF/linkerscript/insert-before.test b/lld/test/ELF/linkerscript/insert-before.test index e6ed413639827a..a72834988007ce 100644 --- a/lld/test/ELF/linkerscript/insert-before.test +++ b/lld/test/ELF/linkerscript/insert-before.test @@ -24,8 +24,9 @@ ## without making more layout changes. Address/offset assignments are different ## with a main linker script. -# RUN: ld.lld --script %s %t1.o -o %t2 -# RUN: llvm-readelf -S -l %t2 | FileCheck --check-prefix=CHECK2 %s +## Test non-contiguous OutputDescs in script->sectionCommands. +# RUN: ld.lld --defsym y0=1 %s --defsym y1=1 %t1.o -o %t2 +# RUN: llvm-readelf -S -l -sX %t2 | FileCheck --check-prefix=CHECK2 %s # CHECK2: Name Type Address Off Size ES Flg # CHECK2-NEXT: NULL # CHECK2-NEXT: .foo.text PROGBITS 000000000020{{.*}} [[#%x,]] 000008 00 AX @@ -40,9 +41,13 @@ # CHECK2-NEXT: LOAD {{.*}} RW 0x1000 # CHECK2-NEXT: GNU_STACK {{.*}} RW 0 +# CHECK2: NOTYPE GLOBAL DEFAULT ABS y0 +# CHECK2: NOTYPE GLOBAL DEFAULT [[#]] (.foo.text) x0 +# CHECK2: NOTYPE GLOBAL DEFAULT ABS y1 + SECTIONS { .byte : { BYTE(0) } } INSERT BEFORE .data; SECTIONS { .foo.data : { *(.foo.data) } } INSERT BEFORE .data; ## The input section .foo.text is an orphan. 
It will be placed in .foo.text -SECTIONS { .foo.text : {} } INSERT BEFORE .text; +SECTIONS { .foo.text : { x0 = .; } } INSERT BEFORE .text; diff --git a/lld/test/ELF/loongarch-relax-align.s b/lld/test/ELF/loongarch-relax-align.s new file mode 100644 index 00000000000000..ab61e15d5caca2 --- /dev/null +++ b/lld/test/ELF/loongarch-relax-align.s @@ -0,0 +1,126 @@ +# REQUIRES: loongarch + +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=+relax %s -o %t.32.o +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.64.o +# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.32.o -o %t.32 +# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.64.o -o %t.64 +# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.32.o --no-relax -o %t.32n +# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.64.o --no-relax -o %t.64n +# RUN: llvm-objdump -td --no-show-raw-insn %t.32 | FileCheck %s +# RUN: llvm-objdump -td --no-show-raw-insn %t.64 | FileCheck %s +# RUN: llvm-objdump -td --no-show-raw-insn %t.32n | FileCheck %s +# RUN: llvm-objdump -td --no-show-raw-insn %t.64n | FileCheck %s + +## Test the R_LARCH_ALIGN without symbol index. +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.o64.o --defsym=old=1 +# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.o64.o -o %t.o64 +# RUN: ld.lld --section-start=.text=0x10000 --section-start=.text2=0x20000 -e 0 %t.o64.o --no-relax -o %t.o64n +# RUN: llvm-objdump -td --no-show-raw-insn %t.o64 | FileCheck %s +# RUN: llvm-objdump -td --no-show-raw-insn %t.o64n | FileCheck %s + +## -r keeps section contents unchanged. 
+# RUN: ld.lld -r %t.64.o -o %t.64.r +# RUN: llvm-objdump -dr --no-show-raw-insn %t.64.r | FileCheck %s --check-prefix=CHECKR + +# CHECK-DAG: {{0*}}10000 l .text {{0*}}44 .Ltext_start +# CHECK-DAG: {{0*}}10038 l .text {{0*}}0c .L1 +# CHECK-DAG: {{0*}}10040 l .text {{0*}}04 .L2 +# CHECK-DAG: {{0*}}20000 l .text2 {{0*}}14 .Ltext2_start + +# CHECK: <.Ltext_start>: +# CHECK-NEXT: break 1 +# CHECK-NEXT: break 2 +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-NEXT: break 3 +# CHECK-NEXT: break 4 +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-NEXT: pcalau12i $a0, 0 +# CHECK-NEXT: addi.{{[dw]}} $a0, $a0, 0 +# CHECK-NEXT: pcalau12i $a0, 0 +# CHECK-NEXT: addi.{{[dw]}} $a0, $a0, 56 +# CHECK-NEXT: pcalau12i $a0, 0 +# CHECK-NEXT: addi.{{[dw]}} $a0, $a0, 64 +# CHECK-EMPTY: +# CHECK-NEXT: <.L1>: +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-EMPTY: +# CHECK-NEXT: <.L2>: +# CHECK-NEXT: break 5 + +# CHECK: <.Ltext2_start>: +# CHECK-NEXT: pcalau12i $a0, 0 +# CHECK-NEXT: addi.{{[dw]}} $a0, $a0, 0 +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-NEXT: break 6 + +# CHECKR: <.Ltext2_start>: +# CHECKR-NEXT: pcalau12i $a0, 0 +# CHECKR-NEXT: {{0*}}00: R_LARCH_PCALA_HI20 .Ltext2_start +# CHECKR-NEXT: {{0*}}00: R_LARCH_RELAX *ABS* +# CHECKR-NEXT: addi.d $a0, $a0, 0 +# CHECKR-NEXT: {{0*}}04: R_LARCH_PCALA_LO12 .Ltext2_start +# CHECKR-NEXT: {{0*}}04: R_LARCH_RELAX *ABS* +# CHECKR-NEXT: nop +# CHECKR-NEXT: {{0*}}08: R_LARCH_ALIGN .Lalign_symbol+0x4 +# CHECKR-NEXT: nop +# CHECKR-NEXT: nop +# CHECKR-NEXT: break 6 + +.macro .fake_p2align_4 max=0 + .ifdef old + .if \max==0 + .reloc ., R_LARCH_ALIGN, 0xc + nop; nop; nop + .endif + .else + .reloc ., R_LARCH_ALIGN, .Lalign_symbol + 0x4 + (\max << 8) + nop; nop; nop + .endif +.endm + + .text +.Lalign_symbol: +.Ltext_start: + break 1 + break 2 +## +0x8: Emit 2 nops, delete 1 nop. + .fake_p2align_4 + + break 3 +## +0x14: Emit 3 nops > 8 bytes, not emit. + .fake_p2align_4 8 + + break 4 + .fake_p2align_4 8 +## +0x18: Emit 2 nops <= 8 bytes. 
+ +## Compensate +.ifdef old + nop; nop +.endif + +## +0x20: Test symbol value and symbol size can be handled. + la.pcrel $a0, .Ltext_start + la.pcrel $a0, .L1 + la.pcrel $a0, .L2 + +## +0x38: Emit 2 nops, delete 1 nop. +.L1: + .fake_p2align_4 +.L2: + break 5 + .size .L1, . - .L1 + .size .L2, . - .L2 + .size .Ltext_start, . - .Ltext_start + +## Test another text section. + .section .text2,"ax",@progbits +.Ltext2_start: + la.pcrel $a0, .Ltext2_start + .fake_p2align_4 + break 6 + .size .Ltext2_start, . - .Ltext2_start diff --git a/lld/test/ELF/loongarch-relax-emit-relocs.s b/lld/test/ELF/loongarch-relax-emit-relocs.s new file mode 100644 index 00000000000000..581fce8c95caa4 --- /dev/null +++ b/lld/test/ELF/loongarch-relax-emit-relocs.s @@ -0,0 +1,49 @@ +# REQUIRES: loongarch +## Test that we can handle --emit-relocs while relaxing. + +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=+relax %s -o %t.32.o +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.64.o +# RUN: ld.lld -Ttext=0x10000 --emit-relocs %t.32.o -o %t.32 +# RUN: ld.lld -Ttext=0x10000 --emit-relocs %t.64.o -o %t.64 +# RUN: llvm-objdump -dr %t.32 | FileCheck %s +# RUN: llvm-objdump -dr %t.64 | FileCheck %s + +## -r should keep original relocations. +# RUN: ld.lld -r %t.64.o -o %t.64.r +# RUN: llvm-objdump -dr %t.64.r | FileCheck %s --check-prefix=CHECKR + +## --no-relax should keep original relocations. +## TODO Due to R_LARCH_RELAX is not relaxed, it plays same as --relax now. 
+# RUN: ld.lld -Ttext=0x10000 --emit-relocs --no-relax %t.64.o -o %t.64.norelax +# RUN: llvm-objdump -dr %t.64.norelax | FileCheck %s + +# CHECK: 00010000 <_start>: +# CHECK-NEXT: pcalau12i $a0, 0 +# CHECK-NEXT: R_LARCH_PCALA_HI20 _start +# CHECK-NEXT: R_LARCH_RELAX *ABS* +# CHECK-NEXT: addi.{{[dw]}} $a0, $a0, 0 +# CHECK-NEXT: R_LARCH_PCALA_LO12 _start +# CHECK-NEXT: R_LARCH_RELAX *ABS* +# CHECK-NEXT: nop +# CHECK-NEXT: R_LARCH_ALIGN .Lla-relax-align0+0x4 +# CHECK-NEXT: nop +# CHECK-NEXT: ret + +# CHECKR: <_start>: +# CHECKR-NEXT: pcalau12i $a0, 0 +# CHECKR-NEXT: R_LARCH_PCALA_HI20 _start +# CHECKR-NEXT: R_LARCH_RELAX *ABS* +# CHECKR-NEXT: addi.d $a0, $a0, 0 +# CHECKR-NEXT: R_LARCH_PCALA_LO12 _start +# CHECKR-NEXT: R_LARCH_RELAX *ABS* +# CHECKR-NEXT: nop +# CHECKR-NEXT: R_LARCH_ALIGN .Lla-relax-align0+0x4 +# CHECKR-NEXT: nop +# CHECKR-NEXT: nop +# CHECKR-NEXT: ret + +.global _start +_start: + la.pcrel $a0, _start + .p2align 4 + ret diff --git a/lld/test/ELF/loongarch-reloc-leb128.s b/lld/test/ELF/loongarch-reloc-leb128.s new file mode 100644 index 00000000000000..2dd327d1564ebd --- /dev/null +++ b/lld/test/ELF/loongarch-reloc-leb128.s @@ -0,0 +1,102 @@ +# REQUIRES: loongarch +# RUN: rm -rf %t && split-file %s %t && cd %t + +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax a.s -o a.o +# RUN: llvm-readobj -r -x .gcc_except_table -x .debug_rnglists -x .debug_loclists a.o | FileCheck %s --check-prefix=REL +# RUN: ld.lld -shared --gc-sections a.o -o a.so +# RUN: llvm-readelf -x .gcc_except_table -x .debug_rnglists -x .debug_loclists a.so | FileCheck %s + +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=+relax a.s -o a32.o +# RUN: llvm-readobj -r -x .gcc_except_table -x .debug_rnglists -x .debug_loclists a32.o | FileCheck %s --check-prefix=REL +# RUN: ld.lld -shared --gc-sections a32.o -o a32.so +# RUN: llvm-readelf -x .gcc_except_table -x .debug_rnglists -x .debug_loclists a32.so | FileCheck %s + +# RUN: llvm-mc --filetype=obj 
--triple=loongarch32 --mattr=+relax extraspace.s -o extraspace32.o +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax extraspace.s -o extraspace64.o +# RUN: not ld.lld -shared extraspace32.o 2>&1 | FileCheck %s --check-prefix=ERROR +# RUN: not ld.lld -shared extraspace64.o 2>&1 | FileCheck %s --check-prefix=ERROR +# ERROR: error: extraspace{{.*}}.o:(.rodata+0x0): extra space for uleb128 + +#--- a.s +.cfi_startproc +.cfi_lsda 0x1b,.LLSDA0 +.cfi_endproc + +.section .text.w,"axR" +break 0; break 0; break 0; w1: + .p2align 4 # 4 bytes after relaxation +w2: break 0 + +.section .text.x,"ax" +break 0; break 0; break 0; x1: + .p2align 4 # 4 bytes after relaxation +x2: break 0 + +.section .gcc_except_table,"a" +.LLSDA0: +.uleb128 w2-w1+116 # initial value: 0x0080 +.uleb128 w1-w2+141 # initial value: 0x0080 +.uleb128 w2-w1+16372 # initial value: 0x008080 +.uleb128 w1-w2+16397 # initial value: 0x008080 +.uleb128 w2-w1+2097140 # initial value: 0x00808080 +.uleb128 w1-w2+2097165 # initial value: 0x00808080 + +.section .debug_rnglists +.uleb128 w2-w1+116 # initial value: 0x0080 +.uleb128 w1-w2+141 # initial value: 0x0080 +.uleb128 w2-w1+16372 # initial value: 0x008080 +.uleb128 w1-w2+16397 # initial value: 0x008080 +.uleb128 w2-w1+2097140 # initial value: 0x00808080 +.uleb128 w1-w2+2097165 # initial value: 0x00808080 + +.section .debug_loclists +.uleb128 x2-x1 # references discarded symbols + +# REL: Section ({{.*}}) .rela.debug_rnglists { +# REL-NEXT: 0x0 R_LARCH_ADD_ULEB128 w2 0x74 +# REL-NEXT: 0x0 R_LARCH_SUB_ULEB128 w1 0x0 +# REL-NEXT: 0x2 R_LARCH_ADD_ULEB128 w1 0x8D +# REL-NEXT: 0x2 R_LARCH_SUB_ULEB128 w2 0x0 +# REL-NEXT: 0x4 R_LARCH_ADD_ULEB128 w2 0x3FF4 +# REL-NEXT: 0x4 R_LARCH_SUB_ULEB128 w1 0x0 +# REL-NEXT: 0x7 R_LARCH_ADD_ULEB128 w1 0x400D +# REL-NEXT: 0x7 R_LARCH_SUB_ULEB128 w2 0x0 +# REL-NEXT: 0xA R_LARCH_ADD_ULEB128 w2 0x1FFFF4 +# REL-NEXT: 0xA R_LARCH_SUB_ULEB128 w1 0x0 +# REL-NEXT: 0xE R_LARCH_ADD_ULEB128 w1 0x20000D +# REL-NEXT: 0xE 
R_LARCH_SUB_ULEB128 w2 0x0 +# REL-NEXT: } +# REL: Section ({{.*}}) .rela.debug_loclists { +# REL-NEXT: 0x0 R_LARCH_ADD_ULEB128 x2 0x0 +# REL-NEXT: 0x0 R_LARCH_SUB_ULEB128 x1 0x0 +# REL-NEXT: } + +# REL: Hex dump of section '.gcc_except_table': +# REL-NEXT: 0x00000000 80008000 80800080 80008080 80008080 . +# REL-NEXT: 0x00000010 8000 . +# REL: Hex dump of section '.debug_rnglists': +# REL-NEXT: 0x00000000 80008000 80800080 80008080 80008080 . +# REL-NEXT: 0x00000010 8000 . +# REL: Hex dump of section '.debug_loclists': +# REL-NEXT: 0x00000000 00 . + +# CHECK: Hex dump of section '.gcc_except_table': +# CHECK-NEXT: 0x[[#%x,]] f8008901 f8ff0089 8001f8ff ff008980 . +# CHECK-NEXT: 0x[[#%x,]] 8001 . +# CHECK: Hex dump of section '.debug_rnglists': +# CHECK-NEXT: 0x00000000 f8008901 f8ff0089 8001f8ff ff008980 . +# CHECK-NEXT: 0x00000010 8001 . +# CHECK: Hex dump of section '.debug_loclists': +# CHECK-NEXT: 0x00000000 00 . + +#--- extraspace.s +.text +w1: + la.pcrel $t0, w1 +w2: + +.rodata +.reloc ., R_LARCH_ADD_ULEB128, w2 +.reloc ., R_LARCH_SUB_ULEB128, w1 +.fill 10, 1, 0x80 +.byte 1 diff --git a/lld/test/ELF/lto/resolution-err.ll b/lld/test/ELF/lto/resolution-err.ll index 6dfa64b1b8b9ee..f9855abaff3279 100644 --- a/lld/test/ELF/lto/resolution-err.ll +++ b/lld/test/ELF/lto/resolution-err.ll @@ -1,5 +1,5 @@ ; UNSUPPORTED: system-windows -; REQUIRES: shell +; REQUIRES: shell, non-root-user ; RUN: llvm-as %s -o %t.bc ; RUN: touch %t.resolution.txt ; RUN: chmod u-w %t.resolution.txt diff --git a/lld/test/ELF/lto/systemz.ll b/lld/test/ELF/lto/systemz.ll new file mode 100644 index 00000000000000..42bf4e32fb6d75 --- /dev/null +++ b/lld/test/ELF/lto/systemz.ll @@ -0,0 +1,18 @@ +; REQUIRES: systemz +;; Test we can infer the e_machine value EM_S390 from a bitcode file. 
+ +; RUN: llvm-as %s -o %t.o +; RUN: ld.lld %t.o -o %t +; RUN: llvm-readobj -h %t | FileCheck %s + +; CHECK: Class: 64-bit +; CHECK: DataEncoding: BigEndian +; CHECK: Machine: EM_S390 + +target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" +target triple = "s390x-unknown-linux-gnu" + +define void @_start() { +entry: + ret void +} diff --git a/lld/test/ELF/lto/thinlto-cant-write-index.ll b/lld/test/ELF/lto/thinlto-cant-write-index.ll index e664acbb17de1a..286fcddd4238a1 100644 --- a/lld/test/ELF/lto/thinlto-cant-write-index.ll +++ b/lld/test/ELF/lto/thinlto-cant-write-index.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ; Basic ThinLTO tests. ; RUN: opt -module-summary %s -o %t1.o diff --git a/lld/test/ELF/lto/thinlto-emit-imports.ll b/lld/test/ELF/lto/thinlto-emit-imports.ll index 6d0e1e65047db4..253ec08619c982 100644 --- a/lld/test/ELF/lto/thinlto-emit-imports.ll +++ b/lld/test/ELF/lto/thinlto-emit-imports.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ;; Test a few properties not tested by thinlto-index-only.ll ; RUN: opt -module-summary %s -o %t1.o diff --git a/lld/test/ELF/mips-pc-relocs.s b/lld/test/ELF/mips-pc-relocs.s index 5e7dbed94ca7c4..7d23f9d7469a48 100644 --- a/lld/test/ELF/mips-pc-relocs.s +++ b/lld/test/ELF/mips-pc-relocs.s @@ -40,11 +40,13 @@ __start: # ^-- (0x20020-0x20000)>>2 # CHECK-NEXT: 20004: beqc $5, $6, 0x20020 # ^-- (0x20020-4-0x20004)>>2 -# CHECK-NEXT: 20008: beqzc $9, 0x20020 -# ^-- (0x20020-4-0x20008)>>2 -# CHECK-NEXT: 2000c: bc 0x20020 -# ^-- (0x20020-4-0x2000c)>>2 -# CHECK-NEXT: 20010: aluipc $2, 0 -# ^-- %hi(0x20020-0x20010) -# CHECK-NEXT: 20014: addiu $2, $2, 12 -# ^-- %lo(0x20020-0x20014) +# CHECK-NEXT: 20008: nop +# CHECK-NEXT: 2000c: beqzc $9, 0x20020 +# ^-- (0x20020-4-0x2000c)>>2 +# CHECK-NEXT: 20010: nop +# CHECK-NEXT: 20014: bc 0x20020 +# ^-- (0x20020-4-0x200014)>>2 +# CHECK-NEXT: 20018: aluipc $2, 0 +# ^-- %hi(0x20020-0x20018) +# CHECK-NEXT: 2001c: addiu $2, 
$2, 4 +# ^-- %lo(0x20020-0x2001c) diff --git a/lld/test/ELF/riscv-tlsdesc-gd-mixed.s b/lld/test/ELF/riscv-tlsdesc-gd-mixed.s new file mode 100644 index 00000000000000..c0e91593ed9686 --- /dev/null +++ b/lld/test/ELF/riscv-tlsdesc-gd-mixed.s @@ -0,0 +1,26 @@ +# REQUIRES: riscv +# RUN: llvm-mc -filetype=obj -triple=riscv64 %s -o %t.o +# RUN: ld.lld -shared %t.o -o %t.so +# RUN: llvm-readobj -r %t.so | FileCheck %s --check-prefix=RELA + +## Both TLSDESC and DTPMOD64/DTPREL64 should be present. +# RELA: .rela.dyn { +# RELA-NEXT: 0x[[#%X,ADDR:]] R_RISCV_TLSDESC a 0x0 +# RELA-NEXT: 0x[[#ADDR+16]] R_RISCV_TLS_DTPMOD64 a 0x0 +# RELA-NEXT: 0x[[#ADDR+24]] R_RISCV_TLS_DTPREL64 a 0x0 +# RELA-NEXT: } + + la.tls.gd a0,a + call __tls_get_addr@plt + +.Ltlsdesc_hi0: + auipc a2, %tlsdesc_hi(a) + ld a3, %tlsdesc_load_lo(.Ltlsdesc_hi0)(a2) + addi a0, a2, %tlsdesc_add_lo(.Ltlsdesc_hi0) + jalr t0, 0(a3), %tlsdesc_call(.Ltlsdesc_hi0) + +.section .tbss,"awT",@nobits +.globl a +.zero 8 +a: +.zero 4 diff --git a/lld/test/ELF/riscv-tlsdesc-relax.s b/lld/test/ELF/riscv-tlsdesc-relax.s new file mode 100644 index 00000000000000..fb24317e6535ca --- /dev/null +++ b/lld/test/ELF/riscv-tlsdesc-relax.s @@ -0,0 +1,189 @@ +# REQUIRES: riscv +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=riscv64 --defsym PAD=0 -mattr=+c,+relax a.s -o a.64.o +# RUN: llvm-mc -filetype=obj -triple=riscv64 --defsym PAD=5000 -mattr=+c,+relax a.s -o aa.64.o +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c,+relax c.s -o c.64.o +# RUN: ld.lld -shared -soname=c.64.so c.64.o -o c.64.so + +# RUN: ld.lld -shared -z now a.64.o c.64.o -o a.64.so -z separate-code +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.so | FileCheck %s --check-prefix=GD64 + +## Test the TLSDESC to LE optimization. Also check --emit-relocs. 
+# RUN: ld.lld -e 0 -z now a.64.o c.64.o -o a.64.le -z separate-code --emit-relocs +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -hdr a.64.le | FileCheck %s --check-prefix=LE64 +# RUN: ld.lld -e 0 -z now aa.64.o c.64.o -o aa.64.le -z separate-code +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d aa.64.le | FileCheck %s --check-prefix=LE64A + +## Test the TLSDESC to IE optimization. +# RUN: ld.lld -e 0 -z now a.64.o c.64.so -o a.64.ie -z separate-code +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.ie | FileCheck %s --check-prefix=IE64 + +# GD64: .got 00000018 00000000000020c0 +# GD64-LABEL: <_start>: +# GD64-NEXT: jal {{.*}} +# GD64-LABEL: : +## &.got[c]-. = 0x20c0+8 - 0x1004 = 0x10c4 +# GD64: 1004: auipc a2, 0x1 +# GD64-NEXT: c.add a7, a7 +# GD64-NEXT: ld a3, 0xc4(a2) +# GD64-NEXT: c.add a7, a7 +# GD64-NEXT: addi a0, a2, 0xc4 +# GD64-NEXT: c.add a7, a7 +# GD64-NEXT: jalr t0, 0x0(a3) +# GD64-NEXT: c.add a0, tp +# GD64-NEXT: jal {{.*}} +## &.got[c]-. = 0x20c0+8 - 0x1020 = 0x10a8 +# GD64-NEXT: 1020: auipc a4, 0x1 +# GD64-NEXT: ld a5, 0xa8(a4) +# GD64-NEXT: addi a0, a4, 0xa8 +# GD64-NEXT: jalr t0, 0x0(a5) +# GD64-NEXT: c.add a0, tp +## &.got[c]-. 
= 0x20c0+8 - 0x1032 = 0x1096 +# GD64-NEXT: 1032: auipc a6, 0x1 +# GD64-NEXT: ld a7, 0x96(a6) +# GD64-NEXT: addi a0, a6, 0x96 +# GD64-NEXT: jalr t0, 0x0(a7) +# GD64-NEXT: c.add a0, tp + +# LE64-LABEL: <_start>: +# LE64-NEXT: jal {{.*}} +# LE64-LABEL: : +# LE64-NEXT: c.add a7, a7 +# LE64-NEXT: R_RISCV_TLSDESC_HI20 b +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: c.add a7, a7 +# LE64-NEXT: R_RISCV_TLSDESC_LOAD_LO12 .Ltlsdesc_hi0 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: 11008: c.add a7, a7 +# LE64-NEXT: R_RISCV_TLSDESC_ADD_LO12 .Ltlsdesc_hi0 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: addi a0, zero, 0x7ff +# LE64-NEXT: R_RISCV_TLSDESC_CALL .Ltlsdesc_hi0 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: c.add a0, tp +# LE64-NEXT: jal {{.*}} +# LE64-NEXT: R_RISCV_JAL foo +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: addi a0, zero, 0x7ff +# LE64-NEXT: R_RISCV_TLSDESC_HI20 b +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: R_RISCV_TLSDESC_LOAD_LO12 .Ltlsdesc_hi1 +# LE64-NEXT: R_RISCV_TLSDESC_ADD_LO12 .Ltlsdesc_hi1 +# LE64-NEXT: R_RISCV_TLSDESC_CALL .Ltlsdesc_hi1 +# LE64-NEXT: c.add a0, tp +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: R_RISCV_TLSDESC_HI20 b +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: R_RISCV_TLSDESC_LOAD_LO12 .Ltlsdesc_hi2 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: R_RISCV_TLSDESC_ADD_LO12 .Ltlsdesc_hi2 +# LE64-NEXT: R_RISCV_RELAX *ABS* +# LE64-NEXT: addi a0, zero, 0x7ff +# LE64-NEXT: R_RISCV_TLSDESC_CALL .Ltlsdesc_hi2 +# LE64-NEXT: c.add a0, tp + +# LE64A-LABEL: <_start>: +# LE64A-NEXT: jal {{.*}} +# LE64A-LABEL: : +# LE64A-NEXT: c.add a7, a7 +# LE64A-NEXT: c.add a7, a7 +# LE64A-NEXT: 11008: lui a0, 0x2 +# LE64A-NEXT: c.add a7, a7 +# LE64A-NEXT: addi a0, a0, -0x479 +# LE64A-NEXT: c.add a0, tp +# LE64A-NEXT: jal {{.*}} +# LE64A-NEXT: lui a0, 0x2 +# LE64A-NEXT: addi a0, a0, -0x479 +# LE64A-NEXT: c.add a0, tp +# LE64A-NEXT: addi zero, zero, 0x0 +# LE64A-NEXT: addi zero, zero, 0x0 +# LE64A-NEXT: lui 
a0, 0x2 +# LE64A-NEXT: addi a0, a0, -0x479 +# LE64A-NEXT: c.add a0, tp + +# IE64: .got 00000010 00000000000120e0 +# IE64-LABEL: <_start>: +# IE64-NEXT: jal {{.*}} +# IE64-LABEL: : +# IE64-NEXT: c.add a7, a7 +# IE64-NEXT: c.add a7, a7 +## &.got[c]-. = 0x120e0+8 - 0x11008 = 0x10e0 +# IE64-NEXT: 11008: auipc a0, 0x1 +# IE64-NEXT: c.add a7, a7 +# IE64-NEXT: ld a0, 0xe0(a0) +# IE64-NEXT: c.add a0, tp +# IE64-NEXT: jal {{.*}} +## &.got[c]-. = 0x120e0+8 - 0x11018 = 0x10d0 +# IE64-NEXT: 11018: auipc a0, 0x1 +# IE64-NEXT: ld a0, 0xd0(a0) +# IE64-NEXT: c.add a0, tp +## &.got[c]-. = 0x120e0+8 - 0x1102a = 0x10be +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: 1102a: auipc a0, 0x1 +# IE64-NEXT: ld a0, 0xbe(a0) +# IE64-NEXT: c.add a0, tp + +#--- a.s +.globl _start +_start: +.balign 16 + call foo + +foo: +.Ltlsdesc_hi0: +.option norelax +## All 4 instructions have an R_RISCV_RELAX. +## Check that optimization/relaxation are not affected by irrelevant instructions. + auipc a2, %tlsdesc_hi(b) + .reloc .-4, R_RISCV_RELAX, 0 + c.add a7, a7 + ld a3, %tlsdesc_load_lo(.Ltlsdesc_hi0)(a2) + .reloc .-4, R_RISCV_RELAX, 0 + c.add a7, a7 + addi a0, a2, %tlsdesc_add_lo(.Ltlsdesc_hi0) + .reloc .-4, R_RISCV_RELAX, 0 + c.add a7, a7 + jalr t0, 0(a3), %tlsdesc_call(.Ltlsdesc_hi0) + .reloc .-4, R_RISCV_RELAX, 0 + add a0, a0, tp +.option relax + + call foo + +.Ltlsdesc_hi1: +.option norelax +## AUIPC has an R_RISCV_RELAX. We perform relaxation, ignoring whether other +## instructions have R_RISCV_RELAX. + auipc a4, %tlsdesc_hi(b) + .reloc .-4, R_RISCV_RELAX, 0 + ld a5, %tlsdesc_load_lo(.Ltlsdesc_hi1)(a4) + addi a0, a4, %tlsdesc_add_lo(.Ltlsdesc_hi1) + jalr t0, 0(a5), %tlsdesc_call(.Ltlsdesc_hi1) + add a0, a0, tp +.option relax + +.Ltlsdesc_hi2: +.option norelax +## AUIPC does not have R_RISCV_RELAX. No relaxation. 
+ auipc a6, %tlsdesc_hi(b) + ld a7, %tlsdesc_load_lo(.Ltlsdesc_hi2)(a6) + .reloc .-4, R_RISCV_RELAX, 0 + addi a0, a6, %tlsdesc_add_lo(.Ltlsdesc_hi2) + .reloc .-4, R_RISCV_RELAX, 0 + jalr t0, 0(a7), %tlsdesc_call(.Ltlsdesc_hi2) + add a0, a0, tp +.option relax + +.section .tbss +.globl a +.zero 8 +a: +.zero 2039+PAD ## Place b at 0x7ff+PAD + +#--- c.s +.tbss +.globl b +b: +.zero 4 diff --git a/lld/test/ELF/riscv-tlsdesc.s b/lld/test/ELF/riscv-tlsdesc.s new file mode 100644 index 00000000000000..1738f86256caa6 --- /dev/null +++ b/lld/test/ELF/riscv-tlsdesc.s @@ -0,0 +1,194 @@ +# REQUIRES: riscv +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=riscv64 a.s -o a.64.o +# RUN: llvm-mc -filetype=obj -triple=riscv64 c.s -o c.64.o +# RUN: ld.lld -shared -soname=c.64.so c.64.o -o c.64.so +# RUN: llvm-mc -filetype=obj -triple=riscv32 --defsym ELF32=1 a.s -o a.32.o +# RUN: llvm-mc -filetype=obj -triple=riscv32 --defsym ELF32=1 c.s -o c.32.o +# RUN: ld.lld -shared -soname=c.32.so c.32.o -o c.32.so + +# RUN: ld.lld -shared -z now a.64.o c.64.o -o a.64.so +# RUN: llvm-readobj -r -x .got a.64.so | FileCheck --check-prefix=GD64-RELA %s +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.so | FileCheck %s --check-prefix=GD64 + +# RUN: ld.lld -shared -z now a.64.o c.64.o -o rel.64.so -z rel +# RUN: llvm-readobj -r -x .got rel.64.so | FileCheck --check-prefix=GD64-REL %s + +# RUN: ld.lld -e 0 -z now a.64.o c.64.o -o a.64.le +# RUN: llvm-readelf -r a.64.le | FileCheck --check-prefix=NOREL %s +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.le | FileCheck %s --check-prefix=LE64 + +# RUN: ld.lld -e 0 -z now a.64.o c.64.so -o a.64.ie +# RUN: llvm-readobj -r a.64.ie | FileCheck --check-prefix=IE64-RELA %s +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.64.ie | FileCheck %s --check-prefix=IE64 + +## 32-bit code is mostly the same. We only test a few variants. The IE optimization uses the LW instruction. 
+ +# RUN: ld.lld -shared -z now a.32.o c.32.o -o rel.32.so -z rel +# RUN: llvm-readobj -r -x .got rel.32.so | FileCheck --check-prefix=GD32-REL %s +# RUN: ld.lld -e 0 -z now a.32.o c.32.so -o a.32.ie +# RUN: llvm-objdump --no-show-raw-insn -M no-aliases -h -d a.32.ie | FileCheck %s --check-prefix=IE32 + +# GD64-RELA: .rela.dyn { +# GD64-RELA-NEXT: 0x2408 R_RISCV_TLSDESC - 0x7FF +# GD64-RELA-NEXT: 0x23E8 R_RISCV_TLSDESC a 0x0 +# GD64-RELA-NEXT: 0x23F8 R_RISCV_TLSDESC c 0x0 +# GD64-RELA-NEXT: } +# GD64-RELA: Hex dump of section '.got': +# GD64-RELA-NEXT: 0x000023e0 20230000 00000000 00000000 00000000 # +# GD64-RELA-NEXT: 0x000023f0 00000000 00000000 00000000 00000000 . + +# GD64-REL: .rel.dyn { +# GD64-REL-NEXT: 0x23F0 R_RISCV_TLSDESC - +# GD64-REL-NEXT: 0x23D0 R_RISCV_TLSDESC a +# GD64-REL-NEXT: 0x23E0 R_RISCV_TLSDESC c +# GD64-REL-NEXT: } +# GD64-REL: Hex dump of section '.got': +# GD64-REL-NEXT: 0x000023c8 08230000 00000000 00000000 00000000 . +# GD64-REL-NEXT: 0x000023d8 00000000 00000000 00000000 00000000 . +# GD64-REL-NEXT: 0x000023e8 00000000 00000000 00000000 00000000 . +# GD64-REL-NEXT: 0x000023f8 ff070000 00000000 . + +# GD32-REL: .rel.dyn { +# GD32-REL-NEXT: 0x2274 R_RISCV_TLSDESC - +# GD32-REL-NEXT: 0x2264 R_RISCV_TLSDESC a +# GD32-REL-NEXT: 0x226C R_RISCV_TLSDESC c +# GD32-REL-NEXT: } +# GD32-REL: Hex dump of section '.got': +# GD32-REL-NEXT: 0x00002260 00220000 00000000 00000000 00000000 . +# GD32-REL-NEXT: 0x00002270 00000000 00000000 ff070000 . + +# GD64: .got 00000038 00000000000023e0 + +## &.got[a]-. = 0x23e0+8 - 0x12e0 = 0x1108 +# GD64: 12e0: auipc a0, 0x1 +# GD64-NEXT: ld a1, 0x108(a0) +# GD64-NEXT: addi a0, a0, 0x108 +# GD64-NEXT: jalr t0, 0x0(a1) +# GD64-NEXT: add a0, a0, tp + +## &.got[b]-. = 0x23e0+40 - 0x12f4 = 0x1114 +# GD64-NEXT: 12f4: auipc a2, 0x1 +# GD64-NEXT: ld a3, 0x114(a2) +# GD64-NEXT: addi a0, a2, 0x114 +# GD64-NEXT: jalr t0, 0x0(a3) +# GD64-NEXT: add a0, a0, tp + +## &.got[c]-. 
= 0x23e0+24 - 0x1308 = 0x10f0 +# GD64-NEXT: 1308: auipc a4, 0x1 +# GD64-NEXT: ld a5, 0xf0(a4) +# GD64-NEXT: addi a0, a4, 0xf0 +# GD64-NEXT: jalr t0, 0x0(a5) +# GD64-NEXT: add a0, a0, tp + +# NOREL: no relocations + +# LE64-LABEL: <.text>: +## st_value(a) = 8 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi a0, zero, 0x8 +# LE64-NEXT: add a0, a0, tp +## st_value(b) = 2047 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi a0, zero, 0x7ff +# LE64-NEXT: add a0, a0, tp +## st_value(c) = 2048 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: addi zero, zero, 0x0 +# LE64-NEXT: lui a0, 0x1 +# LE64-NEXT: addi a0, a0, -0x800 +# LE64-NEXT: add a0, a0, tp + +# IE64-RELA: .rela.dyn { +# IE64-RELA-NEXT: 0x123B0 R_RISCV_TLS_TPREL64 c 0x0 +# IE64-RELA-NEXT: } + +# IE64: .got 00000010 00000000000123a8 + +## a and b are optimized to use LE. c is optimized to IE. +# IE64-LABEL: <.text>: +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi a0, zero, 0x8 +# IE64-NEXT: add a0, a0, tp +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi a0, zero, 0x7ff +# IE64-NEXT: add a0, a0, tp +## &.got[c]-. 
= 0x123a8+8 - 0x112b8 = 0x10f8 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: addi zero, zero, 0x0 +# IE64-NEXT: 112b8: auipc a0, 0x1 +# IE64-NEXT: ld a0, 0xf8(a0) +# IE64-NEXT: add a0, a0, tp + +# IE32: .got 00000008 00012248 + +# IE32-LABEL: <.text>: +## st_value(a) = 8 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi a0, zero, 0x8 +# IE32-NEXT: add a0, a0, tp +## st_value(b) = 2047 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi a0, zero, 0x7ff +# IE32-NEXT: add a0, a0, tp +## &.got[c]-. = 0x12248+4 - 0x111cc = 0x1080 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: addi zero, zero, 0x0 +# IE32-NEXT: 111cc: auipc a0, 0x1 +# IE32-NEXT: lw a0, 0x80(a0) +# IE32-NEXT: add a0, a0, tp + +#--- a.s +.macro load dst, src +.ifdef ELF32 +lw \dst, \src +.else +ld \dst, \src +.endif +.endm + +.Ltlsdesc_hi0: + auipc a0, %tlsdesc_hi(a) + load a1, %tlsdesc_load_lo(.Ltlsdesc_hi0)(a0) + addi a0, a0, %tlsdesc_add_lo(.Ltlsdesc_hi0) + jalr t0, 0(a1), %tlsdesc_call(.Ltlsdesc_hi0) + add a0, a0, tp + +.Ltlsdesc_hi1: + auipc a2, %tlsdesc_hi(b) + load a3, %tlsdesc_load_lo(.Ltlsdesc_hi1)(a2) + addi a0, a2, %tlsdesc_add_lo(.Ltlsdesc_hi1) + jalr t0, 0(a3), %tlsdesc_call(.Ltlsdesc_hi1) + add a0, a0, tp + +.Ltlsdesc_hi2: + auipc a4, %tlsdesc_hi(c) + load a5, %tlsdesc_load_lo(.Ltlsdesc_hi2)(a4) + addi a0, a4, %tlsdesc_add_lo(.Ltlsdesc_hi2) + jalr t0, 0(a5), %tlsdesc_call(.Ltlsdesc_hi2) + add a0, a0, tp + +.section .tbss +.globl a +.zero 8 +a: +.zero 2039 ## Place b at 0x7ff +b: +.zero 1 + +#--- c.s +.tbss +.globl c +c: .zero 4 diff --git a/lld/test/ELF/systemz-got.s b/lld/test/ELF/systemz-got.s new file mode 100644 index 00000000000000..1d558aa3b02905 --- /dev/null +++ b/lld/test/ELF/systemz-got.s @@ -0,0 +1,16 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: llvm-mc -filetype=obj 
-triple=s390x-unknown-linux %p/Inputs/shared.s -o %t2.o +# RUN: ld.lld -shared %t2.o -soname=%t2.so -o %t2.so + +# RUN: ld.lld -dynamic-linker /lib/ld64.so.1 %t.o %t2.so -o %t +# RUN: llvm-readelf -S -r %t | FileCheck %s + +# CHECK: .got PROGBITS {{.*}} {{.*}} 000020 00 WA 0 0 8 + +# CHECK: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries: +# CHECK: {{.*}} 000000010000000a R_390_GLOB_DAT 0000000000000000 bar + 0 + +.global _start +_start: + lgrl %r1,bar@GOT diff --git a/lld/test/ELF/systemz-gotent-relax-align.s b/lld/test/ELF/systemz-gotent-relax-align.s new file mode 100644 index 00000000000000..c6326086f56db0 --- /dev/null +++ b/lld/test/ELF/systemz-gotent-relax-align.s @@ -0,0 +1,48 @@ +# REQUIRES: systemz +## Verify that R_390_GOTENT optimization is not performed on misaligned symbols. + +# RUN: llvm-mc -filetype=obj -relax-relocations -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld %t.o -o %t1 +# RUN: llvm-readelf -S -r -x .got -x .got.plt %t1 | FileCheck --check-prefixes=CHECK %s +# RUN: llvm-objdump --no-print-imm-hex -d %t1 | FileCheck --check-prefix=DISASM %s + +## We retain one .got entry for the unaligned symbol. 
+# CHECK: Name Type Address Off Size ES Flg Lk Inf Al +# CHECK: .got PROGBITS 00000000010021e0 0001e0 000020 00 WA 0 0 8 +# CHECK-NEXT: .relro_padding NOBITS 0000000001002200 000200 000e00 00 WA 0 0 1 +# CHECK-NEXT: .data PROGBITS 0000000001003200 000200 000006 00 WA 0 0 2 + +# CHECK-LABEL: Hex dump of section '.got': +# CHECK-NEXT: 0x010021e0 00000000 00000000 00000000 00000000 +# CHECK-NEXT: 0x010021f0 00000000 00000000 00000000 01003205 + +# DISASM: Disassembly of section .text: +# DISASM: <_start>: +# DISASM-NEXT: larl %r1, 0x1003200 +# DISASM-NEXT: larl %r1, 0x1003200 +# DISASM-NEXT: lgrl %r1, 0x10021f8 +# DISASM-NEXT: lgrl %r1, 0x10021f8 + +.data +.globl var_align +.hidden var_align + .align 2 +var_align: + .long 0 + +.data +.globl var_unalign +.hidden var_unalign + .align 2 + .byte 0 +var_unalign: + .byte 0 + +.text +.globl _start +.type _start, @function +_start: + lgrl %r1, var_align@GOT + lgrl %r1, var_align@GOT + lgrl %r1, var_unalign@GOT + lgrl %r1, var_unalign@GOT diff --git a/lld/test/ELF/systemz-gotent-relax-und-dso.s b/lld/test/ELF/systemz-gotent-relax-und-dso.s new file mode 100644 index 00000000000000..57369a417fd445 --- /dev/null +++ b/lld/test/ELF/systemz-gotent-relax-und-dso.s @@ -0,0 +1,68 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -relax-relocations -triple=s390x-unknown-linux %s -o %t.o +# RUN: llvm-mc -filetype=obj -relax-relocations -triple=s390x-unknown-linux %S/Inputs/gotpc-relax-und-dso.s -o %tdso.o +# RUN: ld.lld -shared %tdso.o -soname=t.so -o %t.so +# RUN: ld.lld --hash-style=sysv -shared %t.o %t.so -o %t +# RUN: llvm-readelf -r %t | FileCheck --check-prefix=RELOC %s +# RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck --check-prefix=DISASM %s + +# RELOC-LABEL: Relocation section '.rela.dyn' at offset {{.*}} contains 3 entries: +# RELOC: 00000000000023f8 000000010000000a R_390_GLOB_DAT 00000000000012d8 foo + 0 +# RELOC: 0000000000002400 000000030000000a R_390_GLOB_DAT 0000000000000000 und + 0 +# RELOC: 
0000000000002408 000000040000000a R_390_GLOB_DAT 0000000000000000 dsofoo + 0 + +# DISASM: Disassembly of section .text: +# DISASM-EMPTY: +# DISASM-NEXT: : +# DISASM-NEXT: bc 0, 0 +# DISASM: : +# DISASM-NEXT: bc 0, 0 +# DISASM: <_start>: +# DISASM-NEXT: lgrl %r1, 0x2400 +# DISASM-NEXT: lgrl %r1, 0x2400 +# DISASM-NEXT: lgrl %r1, 0x2408 +# DISASM-NEXT: lgrl %r1, 0x2408 +# DISASM-NEXT: larl %r1, 0x12dc +# DISASM-NEXT: larl %r1, 0x12dc +# DISASM-NEXT: lgrl %r1, 0x23f8 +# DISASM-NEXT: lgrl %r1, 0x23f8 +# DISASM-NEXT: lgrl %r1, 0x2400 +# DISASM-NEXT: lgrl %r1, 0x2400 +# DISASM-NEXT: lgrl %r1, 0x2408 +# DISASM-NEXT: lgrl %r1, 0x2408 +# DISASM-NEXT: larl %r1, 0x12dc +# DISASM-NEXT: larl %r1, 0x12dc +# DISASM-NEXT: lgrl %r1, 0x23f8 +# DISASM-NEXT: lgrl %r1, 0x23f8 + +.text +.globl foo +.type foo, @function +foo: + nop + +.globl hid +.hidden hid +.type hid, @function +hid: + nop + +.globl _start +.type _start, @function +_start: + lgrl %r1, und@GOT + lgrl %r1, und@GOT + lgrl %r1, dsofoo@GOT + lgrl %r1, dsofoo@GOT + lgrl %r1, hid@GOT + lgrl %r1, hid@GOT + lgrl %r1, foo@GOT + lgrl %r1, foo@GOT + lgrl %r1, und@GOT + lgrl %r1, und@GOT + lgrl %r1, dsofoo@GOT + lgrl %r1, dsofoo@GOT + lgrl %r1, hid@GOT + lgrl %r1, hid@GOT + lgrl %r1, foo@GOT + lgrl %r1, foo@GOT diff --git a/lld/test/ELF/systemz-gotent-relax.s b/lld/test/ELF/systemz-gotent-relax.s new file mode 100644 index 00000000000000..f665e1af9e53d2 --- /dev/null +++ b/lld/test/ELF/systemz-gotent-relax.s @@ -0,0 +1,91 @@ +# REQUIRES: systemz +## Test R_390_GOTENT optimization. 
+ +# RUN: llvm-mc -filetype=obj -relax-relocations -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld %t.o -o %t1 --no-apply-dynamic-relocs +# RUN: llvm-readelf -S -r -x .got.plt %t1 | FileCheck --check-prefixes=CHECK,NOAPPLY %s +# RUN: ld.lld %t.o -o %t1 --apply-dynamic-relocs +# RUN: llvm-readelf -S -r -x .got.plt %t1 | FileCheck --check-prefixes=CHECK,APPLY %s +# RUN: ld.lld %t.o -o %t1 +# RUN: llvm-objdump --no-print-imm-hex -d %t1 | FileCheck --check-prefix=DISASM %s + +## --no-relax disables GOT optimization. +# RUN: ld.lld --no-relax %t.o -o %t2 +# RUN: llvm-objdump --no-print-imm-hex -d %t2 | FileCheck --check-prefix=NORELAX %s + +## In our implementation, .got is retained even if all GOT-generating relocations are optimized. +# CHECK: Name Type Address Off Size ES Flg Lk Inf Al +# CHECK: .iplt PROGBITS 0000000001001240 000240 000020 00 AX 0 0 16 +# CHECK-NEXT: .got PROGBITS 0000000001002260 000260 000018 00 WA 0 0 8 +# CHECK-NEXT: .relro_padding NOBITS 0000000001002278 000278 000d88 00 WA 0 0 1 +# CHECK-NEXT: .got.plt PROGBITS 0000000001003278 000278 000008 00 WA 0 0 8 + +## There is one R_S390_IRELATIVE relocation. 
+# CHECK-LABEL: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries: +# CHECK: 0000000001003278 000000000000003d R_390_IRELATIVE 10011e8 + +# CHECK-LABEL: Hex dump of section '.got.plt': +# NOAPPLY-NEXT: 0x01003278 00000000 00000000 +# APPLY-NEXT: 0x01003278 00000000 010011e8 + +# DISASM: Disassembly of section .text: +# DISASM: 00000000010011e0 : +# DISASM-NEXT: bc 0, 0 +# DISASM: 00000000010011e4 : +# DISASM-NEXT: bc 0, 0 +# DISASM: 00000000010011e8 : +# DISASM-NEXT: br %r14 +# DISASM: 00000000010011ea <_start>: +# DISASM-NEXT: larl %r1, 0x10011e0 +# DISASM-NEXT: larl %r1, 0x10011e0 +# DISASM-NEXT: larl %r1, 0x10011e4 +# DISASM-NEXT: larl %r1, 0x10011e4 +# DISASM-NEXT: lgrl %r1, 0x1003278 +# DISASM-NEXT: lgrl %r1, 0x1003278 +# DISASM-NEXT: larl %r1, 0x10011e0 +# DISASM-NEXT: larl %r1, 0x10011e0 +# DISASM-NEXT: larl %r1, 0x10011e4 +# DISASM-NEXT: larl %r1, 0x10011e4 +# DISASM-NEXT: lgrl %r1, 0x1003278 +# DISASM-NEXT: lgrl %r1, 0x1003278 + +# NORELAX-LABEL: <_start>: +# NORELAX-COUNT-12: lgrl + +.text +.globl foo + +.text +.globl foo +.type foo, @function +foo: + nop + +.globl hid +.hidden hid +.type hid, @function +hid: + nop + +.text +.type ifunc STT_GNU_IFUNC +.globl ifunc +.type ifunc, @function +ifunc: + br %r14 + +.globl _start +.type _start, @function +_start: + lgrl %r1, foo@GOT + lgrl %r1, foo@GOT + lgrl %r1, hid@GOT + lgrl %r1, hid@GOT + lgrl %r1, ifunc@GOT + lgrl %r1, ifunc@GOT + lgrl %r1, foo@GOT + lgrl %r1, foo@GOT + lgrl %r1, hid@GOT + lgrl %r1, hid@GOT + lgrl %r1, ifunc@GOT + lgrl %r1, ifunc@GOT diff --git a/lld/test/ELF/systemz-ifunc-nonpreemptible.s b/lld/test/ELF/systemz-ifunc-nonpreemptible.s new file mode 100644 index 00000000000000..5056db302ca1c7 --- /dev/null +++ b/lld/test/ELF/systemz-ifunc-nonpreemptible.s @@ -0,0 +1,75 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-none-linux-gnu %s -o %t.o +# RUN: ld.lld -static %t.o -o %t +# RUN: ld.lld -static %t.o -o %t.apply --apply-dynamic-relocs +# RUN: 
llvm-readelf --section-headers --relocations --symbols %t | FileCheck %s +# RUN: llvm-readelf -x .got.plt %t | FileCheck %s --check-prefix=NO-APPLY-RELOC +# RUN: llvm-readelf -x .got.plt %t.apply | FileCheck %s --check-prefix=APPLY-RELOC +# RUN: llvm-objdump --no-print-imm-hex -d --no-show-raw-insn %t | FileCheck %s --check-prefix=DISASM + +# CHECK: Section Headers: +# CHECK-NEXT: [Nr] Name Type Address Off Size ES Flg Lk Inf Al +# CHECK-NEXT: [ 0] NULL 0000000000000000 000000 000000 00 0 0 0 +# CHECK-NEXT: [ 1] .rela.dyn RELA 0000000001000158 000158 000030 18 AI 0 4 8 +# CHECK-NEXT: [ 2] .text PROGBITS 0000000001001188 000188 00001c 00 AX 0 0 4 +# CHECK-NEXT: [ 3] .iplt PROGBITS 00000000010011b0 0001b0 000040 00 AX 0 0 16 +# CHECK-NEXT: [ 4] .got.plt PROGBITS 00000000010021f0 0001f0 000010 00 WA 0 0 8 + +# CHECK: Relocation section '.rela.dyn' at offset 0x158 contains 2 entries: +# CHECK-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend +# CHECK-NEXT: 00000000010021f0 000000000000003d R_390_IRELATIVE 1001188 +# CHECK-NEXT: 00000000010021f8 000000000000003d R_390_IRELATIVE 100118a + +# CHECK: Symbol table '.symtab' contains 6 entries: +# CHECK-NEXT: Num: Value Size Type Bind Vis Ndx Name +# CHECK-NEXT: 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND +# CHECK-NEXT: 1: 0000000001000158 0 NOTYPE LOCAL HIDDEN 1 __rela_iplt_start +# CHECK-NEXT: 2: 0000000001000188 0 NOTYPE LOCAL HIDDEN 1 __rela_iplt_end +# CHECK-NEXT: 3: 0000000001001188 0 IFUNC GLOBAL DEFAULT 2 foo +# CHECK-NEXT: 4: 000000000100118a 0 IFUNC GLOBAL DEFAULT 2 bar +# CHECK-NEXT: 5: 000000000100118c 0 NOTYPE GLOBAL DEFAULT 2 _start + +# NO-APPLY-RELOC-LABEL: Hex dump of section '.got.plt': +# NO-APPLY-RELOC-NEXT: 0x010021f0 00000000 00000000 00000000 00000000 +# NO-APPLY-RELOC-EMPTY: + +# APPLY-RELOC-LABEL: Hex dump of section '.got.plt': +# APPLY-RELOC-NEXT: 0x010021f0 00000000 01001188 00000000 0100118a +# APPLY-RELOC-EMPTY: + +# DISASM: Disassembly of section .text: +# DISASM: 0000000001001188 
: +# DISASM-NEXT: br %r14 +# DISASM: 000000000100118a : +# DISASM-NEXT: br %r14 +# DISASM: 000000000100118c <_start>: +# DISASM-NEXT: brasl %r14, 0x10011b0 +# DISASM-NEXT: brasl %r14, 0x10011d0 +# DISASM-NEXT: larl %r2, 0x1000158 +# DISASM-NEXT: larl %r2, 0x1000188 +# DISASM: Disassembly of section .iplt: +# DISASM: <.iplt>: +# DISASM: 10011b0: larl %r1, 0x10021f0 +# DISASM-NEXT: 10011b6: lg %r1, 0(%r1) +# DISASM-NEXT: 10011bc: br %r1 +# DISASM: 10011d0: larl %r1, 0x10021f8 +# DISASM-NEXT: 10011d6: lg %r1, 0(%r1) +# DISASM-NEXT: 10011dc: br %r1 + +.text +.type foo STT_GNU_IFUNC +.globl foo +foo: + br %r14 + +.type bar STT_GNU_IFUNC +.globl bar +bar: + br %r14 + +.globl _start +_start: + brasl %r14, foo@plt + brasl %r14, bar@plt + larl %r2, __rela_iplt_start + larl %r2, __rela_iplt_end diff --git a/lld/test/ELF/systemz-init-padding.s b/lld/test/ELF/systemz-init-padding.s new file mode 100644 index 00000000000000..c56b98d43f1b0e --- /dev/null +++ b/lld/test/ELF/systemz-init-padding.s @@ -0,0 +1,27 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %p/Inputs/systemz-init.s -o systemz-init.o +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld -dynamic-linker /lib/ld64.so.1 %t.o systemz-init.o -o %t +# RUN: llvm-objdump -d --no-show-raw-insn -j .init %t | FileCheck %s + +# glibc < 2.39 used to align .init and .fini code at a 4-byte boundary. +# When that happens, the linker must not pad the code with invalid +# instructions, e.g. null bytes. 
+ .section .init,"ax",@progbits + brasl %r14, startup + +# CHECK: <.init>: +# CHECK-NEXT: brasl %r14, +# CHECK-NEXT: bcr 0, %r7 +# CHECK-NEXT: lg %r4, 272(%r15) + + .text + .globl startup + .p2align 4 +startup: + br %r14 + + .globl main + .p2align 4 +main: + br %r14 diff --git a/lld/test/ELF/systemz-pie.s b/lld/test/ELF/systemz-pie.s new file mode 100644 index 00000000000000..bb971a82fb8ced --- /dev/null +++ b/lld/test/ELF/systemz-pie.s @@ -0,0 +1,38 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t1.o + +## Check -pie. +# RUN: ld.lld -pie %t1.o -o %t +# RUN: llvm-readelf --file-headers --program-headers --dynamic %t | FileCheck %s + +# CHECK: ELF Header: +# CHECK-NEXT: Magic: 7f 45 4c 46 02 02 01 00 00 00 00 00 00 00 00 00 +# CHECK-NEXT: Class: ELF64 +# CHECK-NEXT: Data: 2's complement, big endian +# CHECK-NEXT: Version: 1 (current) +# CHECK-NEXT: OS/ABI: UNIX - System V +# CHECK-NEXT: ABI Version: 0 +# CHECK-NEXT: Type: DYN (Shared object file) +# CHECK-NEXT: Machine: IBM S/390 +# CHECK-NEXT: Version: 0x1 + +# CHECK: Program Headers: +# CHECK-NEXT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# CHECK-NEXT: PHDR 0x000040 0x0000000000000040 0x0000000000000040 0x000188 0x000188 R 0x8 +# CHECK-NEXT: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x00020d 0x00020d R 0x1000 +# CHECK-NEXT: LOAD 0x000210 0x0000000000002210 0x0000000000002210 0x000090 0x000df0 RW 0x1000 +# CHECK-NEXT: DYNAMIC 0x000210 0x0000000000002210 0x0000000000002210 0x000090 0x000090 RW 0x8 +# CHECK-NEXT: GNU_RELRO 0x000210 0x0000000000002210 0x0000000000002210 0x000090 0x000df0 R 0x1 +# CHECK-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0x0 + +# CHECK: Dynamic section at offset 0x210 contains 9 entries: +# CHECK-NEXT: Tag Type Name/Value +# CHECK-NEXT: 0x000000006ffffffb (FLAGS_1) PIE + +## Check -nopie +# RUN: ld.lld -no-pie %t1.o -o %t2 +# RUN: llvm-readelf --file-headers %t2 | FileCheck %s 
--check-prefix=NOPIE +# NOPIE-NOT: Type: DYN + +.globl _start +_start: diff --git a/lld/test/ELF/systemz-plt.s b/lld/test/ELF/systemz-plt.s new file mode 100644 index 00000000000000..4669f01f588121 --- /dev/null +++ b/lld/test/ELF/systemz-plt.s @@ -0,0 +1,83 @@ +# REQUIRES: systemz +# RUN: echo '.globl bar, weak; .type bar,@function; .type weak,@function; bar: weak:' > %t1.s + +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %t1.s -o %t1.o +# RUN: ld.lld -shared %t1.o -soname=t1.so -o %t1.so +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld %t.o %t1.so -z separate-code -o %t +# RUN: llvm-readelf -S -s -r -x .got.plt %t | FileCheck %s +# RUN: llvm-objdump -d %t | FileCheck --check-prefixes=DIS %s + +# CHECK: Section Headers: +# CHECK: .plt PROGBITS 0000000001001020 001020 000060 00 AX 0 0 16 +# CHECK: .got PROGBITS 00000000010020d0 0020d0 000018 00 WA 0 0 8 +# CHECK: .got.plt PROGBITS 00000000010030e8 0020e8 000010 00 WA 0 0 8 + +# CHECK: Relocation section '.rela.plt' at offset {{.*}} contains 2 entries: +# CHECK: 00000000010030e8 000000010000000b R_390_JMP_SLOT 0000000000000000 bar + 0 +# CHECK: 00000000010030f0 000000020000000b R_390_JMP_SLOT 0000000000000000 weak + 0 + +## A canonical PLT has a non-zero st_value. bar and weak are called but their +## addresses are not taken, so a canonical PLT is not necessary. +# CHECK: Symbol table '.dynsym' contains 3 entries: +# CHECK-NEXT: Num: Value Size Type Bind Vis Ndx Name +# CHECK-NEXT: 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND +# CHECK-NEXT: 1: 0000000000000000 0 FUNC GLOBAL DEFAULT UND bar +# CHECK-NEXT: 2: 0000000000000000 0 FUNC WEAK DEFAULT UND weak + +## The .got.plt slots relocated by .rela.plt point to .plt +## This is required by glibc. 
+# CHECK: Hex dump of section '.got.plt': +# CHECK-NEXT: 0x010030e8 00000000 0100104e 00000000 0100106e + +# DIS: Disassembly of section .text: + +# DIS: 0000000001001000 <_start>: +# DIS-NEXT: brasl %r14, 0x1001012 +# DIS-NEXT: brasl %r14, 0x1001040 +# DIS-NEXT: brasl %r14, 0x1001060 + +# DIS: 0000000001001012 : +# DIS-NEXT: br %r14 + +# DIS: Disassembly of section .plt: + +# DIS: 0000000001001020 <.plt>: +# DIS-NEXT: 1001020: e3 10 f0 38 00 24 stg %r1, 56(%r15) +# DIS-NEXT: 1001026: c0 10 00 00 08 55 larl %r1, 0x10020d0 +# DIS-NEXT: 100102c: d2 07 f0 30 10 08 mvc 48(8,%r15), 8(%r1) +# DIS-NEXT: 1001032: e3 10 10 10 00 04 lg %r1, 16(%r1) +# DIS-NEXT: 1001038: 07 f1 br %r1 +# DIS-NEXT: 100103a: 07 00 bcr 0, %r0 +# DIS-NEXT: 100103c: 07 00 bcr 0, %r0 +# DIS-NEXT: 100103e: 07 00 bcr 0, %r0 +# DIS-NEXT: 1001040: c0 10 00 00 10 54 larl %r1, 0x10030e8 +# DIS-NEXT: 1001046: e3 10 10 00 00 04 lg %r1, 0(%r1) +# DIS-NEXT: 100104c: 07 f1 br %r1 +# DIS-NEXT: 100104e: 0d 10 basr %r1, 0 +# DIS-NEXT: 1001050: e3 10 10 0c 00 14 lgf %r1, 12(%r1) +# DIS-NEXT: 1001056: c0 f4 ff ff ff e5 jg 0x1001020 +# DIS-NEXT: 100105c: 00 00 +# DIS-NEXT: 100105e: 00 00 +# DIS-NEXT: 1001060: c0 10 00 00 10 48 larl %r1, 0x10030f0 +# DIS-NEXT: 1001066: e3 10 10 00 00 04 lg %r1, 0(%r1) +# DIS-NEXT: 100106c: 07 f1 br %r1 +# DIS-NEXT: 100106e: 0d 10 basr %r1, 0 +# DIS-NEXT: 1001070: e3 10 10 0c 00 14 lgf %r1, 12(%r1) +# DIS-NEXT: 1001076: c0 f4 ff ff ff d5 jg 0x1001020 +# DIS-NEXT: 100107c: 00 00 +# DIS-NEXT: 100107e: 00 18 + +.global _start, foo, bar +.weak weak + +_start: + ## Use @plt to avoid generating direct references that would force + ## allocation of a canonical PLT entry. + brasl %r14, foo@plt + brasl %r14, bar@plt + brasl %r14, weak@plt + +## foo is local and non-preemptable, no PLT is generated. 
+foo: + br %r14 diff --git a/lld/test/ELF/systemz-reloc-abs.s b/lld/test/ELF/systemz-reloc-abs.s new file mode 100644 index 00000000000000..b5ad94d90d3a96 --- /dev/null +++ b/lld/test/ELF/systemz-reloc-abs.s @@ -0,0 +1,32 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x %s -o %t.o +# RUN: llvm-mc -filetype=obj -triple=s390x %S/Inputs/abs255.s -o %t255.o +# RUN: llvm-mc -filetype=obj -triple=s390x %S/Inputs/abs256.s -o %t256.o +# RUN: llvm-mc -filetype=obj -triple=s390x %S/Inputs/abs257.s -o %t257.o + +# RUN: ld.lld %t.o %t256.o -o %t +# RUN: llvm-readelf -x .data %t | FileCheck %s +# CHECK: 0x{{[0-9a-f]+}} ff80ffff 8000ffff ffff8000 0000ffff +# CHECK-NEXT: ffffffff ffff8000 00000000 0000 + +# RUN: not ld.lld %t.o %t255.o -o /dev/null 2>&1 | FileCheck --check-prefix=OVERFLOW1 %s +# OVERFLOW1: relocation R_390_8 out of range: -129 is not in [-128, 255] +# OVERFLOW1: relocation R_390_16 out of range: -32769 is not in [-32768, 65535] +# OVERFLOW1: relocation R_390_32 out of range: -2147483649 is not in [-2147483648, 4294967295] + +# RUN: not ld.lld %t.o %t257.o -o /dev/null 2>&1 | FileCheck --check-prefix=OVERFLOW2 %s +# OVERFLOW2: relocation R_390_8 out of range: 256 is not in [-128, 255] +# OVERFLOW2: relocation R_390_16 out of range: 65536 is not in [-32768, 65535] +# OVERFLOW2: relocation R_390_32 out of range: 4294967296 is not in [-2147483648, 4294967295] + +.globl _start +_start: +.data +.byte foo - 1 +.byte foo - 384 +.word foo + 0xfeff +.word foo - 0x8100 +.long foo + 0xfffffeff +.long foo - 0x80000100 +.quad foo + 0xfffffffffffffeff +.quad foo - 0x8000000000000100 diff --git a/lld/test/ELF/systemz-reloc-disp12.s b/lld/test/ELF/systemz-reloc-disp12.s new file mode 100644 index 00000000000000..3d32707d149fe7 --- /dev/null +++ b/lld/test/ELF/systemz-reloc-disp12.s @@ -0,0 +1,21 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=291 %s -o %t1.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=4095 %s -o 
%t2.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=4096 %s -o %t3.o + +# RUN: ld.lld --section-start=.text=0x0 %t1.o -o %t1out +# RUN: ld.lld --section-start=.text=0x0 %t2.o -o %t2out +# RUN: not ld.lld --section-start=.text=0x0 %t3.o -o /dev/null 2>&1 | FileCheck %s --check-prefix RANGE + +# RANGE: relocation R_390_12 out of range: 4096 is not in [0, 4095] + +# RUN: llvm-readelf --hex-dump=.text %t1out | FileCheck %s -DINSN=58678123 --check-prefix DUMP +# RUN: llvm-readelf --hex-dump=.text %t2out | FileCheck %s -DINSN=58678fff --check-prefix DUMP + +# DUMP: 0x00000000 [[INSN]] + +.text +.globl _start +_start: + .reloc .+2, R_390_12, DISP + l %r6, 0(%r7,%r8) diff --git a/lld/test/ELF/systemz-reloc-disp20.s b/lld/test/ELF/systemz-reloc-disp20.s new file mode 100644 index 00000000000000..88cd657c6ae3cb --- /dev/null +++ b/lld/test/ELF/systemz-reloc-disp20.s @@ -0,0 +1,21 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=74565 %s -o %t1.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=524287 %s -o %t2.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=524288 %s -o %t3.o + +# RUN: ld.lld --section-start=.text=0x0 %t1.o -o %t1out +# RUN: ld.lld --section-start=.text=0x0 %t2.o -o %t2out +# RUN: not ld.lld --section-start=.text=0x0 %t3.o -o /dev/null 2>&1 | FileCheck %s --check-prefix RANGE + +# RANGE: relocation R_390_20 out of range: 524288 is not in [-524288, 524287] + +# RUN: llvm-readelf --hex-dump=.text %t1out | FileCheck %s -DINSN="e3678345 1204" --check-prefix DUMP +# RUN: llvm-readelf --hex-dump=.text %t2out | FileCheck %s -DINSN="e3678fff 7f04" --check-prefix DUMP + +# DUMP: 0x00000000 [[INSN]] + +.text +.globl _start +_start: + .reloc .+2, R_390_20, DISP + lg %r6, 0(%r7,%r8) diff --git a/lld/test/ELF/systemz-reloc-got.s b/lld/test/ELF/systemz-reloc-got.s new file mode 100644 index 00000000000000..4b9ac16481f4cb --- /dev/null +++ b/lld/test/ELF/systemz-reloc-got.s @@ -0,0 +1,92 @@ +# REQUIRES: systemz 
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld -z norelro -shared %t.o -soname=t.so -o %t.so +## Note: Without norelro the distance between .got and .got.plt causes +## R_390_GOTPLT12 relocations to always overflow. + +# RUN: llvm-readelf -S -x .data %t.so | FileCheck %s +# RUN: llvm-objdump -d --no-show-raw-insn %t.so | FileCheck %s --check-prefix=DISASM + +# CHECK: Section Headers: +# CHECK: .got PROGBITS 0000000000002458 +# CHECK: .got.plt PROGBITS 0000000000002480 + +## Note: _GLOBAL_OFFSET_TABLE is at .got +## GOT (foo) is at .got + 24 == 0x2470 +## GOT (bar) is at .got + 32 == 0x2478 +## GOTPLT (foo) is at .got.plt + 0 == .got + 40 == 0x2480 +## GOTPLT (bar) is at .got.plt + 8 == .got + 48 == 0x2488 + +# DISASM: larl %r12, 0x2458 +# DISASM-NEXT: larl %r1, 0x2470 +# DISASM-NEXT: larl %r1, 0x2478 +# DISASM-NEXT: larl %r1, 0x2480 +# DISASM-NEXT: larl %r1, 0x2488 + +# DISASM-NEXT: l %r1, 24(%r12) +# DISASM-NEXT: l %r1, 32(%r12) +# DISASM-NEXT: l %r1, 40(%r12) +# DISASM-NEXT: l %r1, 48(%r12) +# DISASM-NEXT: lg %r1, 24(%r12) +# DISASM-NEXT: lg %r1, 32(%r12) +# DISASM-NEXT: lg %r1, 40(%r12) +# DISASM-NEXT: lg %r1, 48(%r12) + +# CHECK: Hex dump of section '.data': +# CHECK-NEXT: 00180020 00280030 00000018 00000020 +# CHECK-NEXT: 00000028 00000030 00000000 00000018 +# CHECK-NEXT: 00000000 00000020 00000000 00000028 +# CHECK-NEXT: 00000000 00000030 + +.text +larl %r12, _GLOBAL_OFFSET_TABLE_ +.reloc .+2, R_390_GOTENT, foo+2 +larl %r1, 0 +.reloc .+2, R_390_GOTENT, bar+2 +larl %r1, 0 +.reloc .+2, R_390_GOTPLTENT, foo+2 +larl %r1, 0 +.reloc .+2, R_390_GOTPLTENT, bar+2 +larl %r1, 0 +.reloc .+2, R_390_GOT12, foo +l %r1, 0(%r12) +.reloc .+2, R_390_GOT12, bar +l %r1, 0(%r12) +.reloc .+2, R_390_GOTPLT12, foo +l %r1, 0(%r12) +.reloc .+2, R_390_GOTPLT12, bar +l %r1, 0(%r12) +.reloc .+2, R_390_GOT20, foo +lg %r1, 0(%r12) +.reloc .+2, R_390_GOT20, bar +lg %r1, 0(%r12) +.reloc .+2, R_390_GOTPLT20, foo +lg %r1, 0(%r12) +.reloc .+2, R_390_GOTPLT20, 
bar +lg %r1, 0(%r12) + +.data +.reloc ., R_390_GOT16, foo +.space 2 +.reloc ., R_390_GOT16, bar +.space 2 +.reloc ., R_390_GOTPLT16, foo +.space 2 +.reloc ., R_390_GOTPLT16, bar +.space 2 +.reloc ., R_390_GOT32, foo +.space 4 +.reloc ., R_390_GOT32, bar +.space 4 +.reloc ., R_390_GOTPLT32, foo +.space 4 +.reloc ., R_390_GOTPLT32, bar +.space 4 +.reloc ., R_390_GOT64, foo +.space 8 +.reloc ., R_390_GOT64, bar +.space 8 +.reloc ., R_390_GOTPLT64, foo +.space 8 +.reloc ., R_390_GOTPLT64, bar +.space 8 diff --git a/lld/test/ELF/systemz-reloc-gotrel.s b/lld/test/ELF/systemz-reloc-gotrel.s new file mode 100644 index 00000000000000..46669ecfa7fd01 --- /dev/null +++ b/lld/test/ELF/systemz-reloc-gotrel.s @@ -0,0 +1,36 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: ld.lld -shared %t.o -soname=t.so -o %t.so + +# RUN: llvm-readelf -S -s -x .data %t.so | FileCheck %s + +# CHECK: Section Headers: +# CHECK: .plt PROGBITS 0000000000001290 +# CHECK: .got PROGBITS 0000000000002390 + +# CHECK: Symbol table '.symtab' +# CHECK: 0000000000001288 {{.*}} bar + +## Note: foo is the first (and only) PLT entry, which resides at .plt + 32 +## PLTOFF (foo) is (.plt + 32) - .got == 0x12b0 - 0x2390 == 0xffffef20 +## GOTOFF (bar) is bar - .got == 0x1288 - 0x2390 == 0xffffeef8 +# CHECK: Hex dump of section '.data': +# CHECK-NEXT: eef8ef20 ffffeef8 ffffef20 ffffffff +# CHECK-NEXT: ffffeef8 ffffffff ffffef20 + +bar: + br %r14 + +.data +.reloc ., R_390_GOTOFF16, bar +.space 2 +.reloc ., R_390_PLTOFF16, foo +.space 2 +.reloc ., R_390_GOTOFF, bar +.space 4 +.reloc ., R_390_PLTOFF32, foo +.space 4 +.reloc ., R_390_GOTOFF64, bar +.space 8 +.reloc ., R_390_PLTOFF64, foo +.space 8 diff --git a/lld/test/ELF/systemz-reloc-pc16.s b/lld/test/ELF/systemz-reloc-pc16.s new file mode 100644 index 00000000000000..e1dad5af239d45 --- /dev/null +++ b/lld/test/ELF/systemz-reloc-pc16.s @@ -0,0 +1,39 @@ +# REQUIRES: systemz +# RUN: rm -rf %t && split-file %s %t + +## 
Check recompile with -fPIC error message +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %t/shared.s -o %t/shared.o +# RUN: not ld.lld -shared %t/shared.o -o /dev/null 2>&1 | FileCheck %s + +# CHECK: error: relocation R_390_PC16 cannot be used against symbol '_shared'; recompile with -fPIC +# CHECK: >>> defined in {{.*}} +# CHECK: >>> referenced by {{.*}}:(.data+0x1) + +## Check patching of negative addends + +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=1 %t/addend.s -o %t/1.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=32768 %t/addend.s -o %t/2.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=32769 %t/addend.s -o %t/3.o + +# RUN: ld.lld --section-start=.text=0x0 %t/1.o -o %t/1out +# RUN: ld.lld --section-start=.text=0x0 %t/2.o -o %t/2out +# RUN: not ld.lld --section-start=.text=0x0 %t/3.o -o /dev/null 2>&1 | FileCheck %s -DFILE=%t/3.o --check-prefix RANGE + +# RANGE: error: [[FILE]]:(.text+0x0): relocation R_390_PC16 out of range + +# RUN: llvm-readelf --hex-dump=.text %t/1out | FileCheck %s -DADDEND=ffff --check-prefix DUMP +# RUN: llvm-readelf --hex-dump=.text %t/2out | FileCheck %s -DADDEND=8000 --check-prefix DUMP + +# DUMP: 0x00000000 [[ADDEND]] + +#--- shared.s +.data + .byte 0xe8 + .word _shared - . 
+ +#--- addend.s +.text +.globl _start +_start: + .reloc ., R_390_PC16, .text-ADDEND + .space 2 diff --git a/lld/test/ELF/systemz-reloc-pc32.s b/lld/test/ELF/systemz-reloc-pc32.s new file mode 100644 index 00000000000000..0cb9322eb1c1b9 --- /dev/null +++ b/lld/test/ELF/systemz-reloc-pc32.s @@ -0,0 +1,39 @@ +# REQUIRES: systemz +# RUN: rm -rf %t && split-file %s %t + +## Check recompile with -fPIC error message +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %t/shared.s -o %t/shared.o +# RUN: not ld.lld -shared %t/shared.o -o /dev/null 2>&1 | FileCheck %s + +# CHECK: error: relocation R_390_PC32 cannot be used against symbol '_shared'; recompile with -fPIC +# CHECK: >>> defined in {{.*}} +# CHECK: >>> referenced by {{.*}}:(.data+0x1) + +## Check patching of negative addends + +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=1 %t/addend.s -o %t/1.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=2147483648 %t/addend.s -o %t/2.o +# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=2147483649 %t/addend.s -o %t/3.o + +# RUN: ld.lld --section-start=.text=0x0 %t/1.o -o %t/1out +# RUN: ld.lld --section-start=.text=0x0 %t/2.o -o %t/2out +# RUN: not ld.lld --section-start=.text=0x0 %t/3.o -o /dev/null 2>&1 | FileCheck %s -DFILE=%t/3.o --check-prefix RANGE + +# RANGE: error: [[FILE]]:(.text+0x0): relocation R_390_PC32 out of range + +# RUN: llvm-readelf --hex-dump=.text %t/1out | FileCheck %s -DADDEND=ffffffff --check-prefix DUMP +# RUN: llvm-readelf --hex-dump=.text %t/2out | FileCheck %s -DADDEND=80000000 --check-prefix DUMP + +# DUMP: 0x00000000 [[ADDEND]] + +#--- shared.s +.data + .byte 0xe8 + .long _shared - . 
+ +#--- addend.s +.text +.globl _start +_start: + .reloc ., R_390_PC32, .text-ADDEND + .space 4 diff --git a/lld/test/ELF/systemz-reloc-pcdbl.s b/lld/test/ELF/systemz-reloc-pcdbl.s new file mode 100644 index 00000000000000..faee756f5e95b4 --- /dev/null +++ b/lld/test/ELF/systemz-reloc-pcdbl.s @@ -0,0 +1,68 @@ +# REQUIRES: systemz + +# RUN: llvm-mc --filetype=obj --triple=s390x-unknown-linux -mcpu=z13 %s -o %t.o + +# RUN: ld.lld %t.o --defsym foo16=pc16dbl+4 --defsym bar16=pc16dbl --defsym foo32=pc32dbl+6 --defsym bar32=pc32dbl --defsym foo12=pc12dbl+6 --defsym bar12=pc12dbl --defsym foo24=pc24dbl+6 --defsym bar24=pc24dbl -o %t +# RUN: llvm-objdump --no-show-raw-insn --mcpu=z13 -d %t | FileCheck %s --check-prefix=CHECK +# CHECK: 0000000001001120 : +# CHECK: je 0x1001124 +# CHECK: jne 0x1001120 +# CHECK: 0000000001001128 : +# CHECK: jge 0x100112e +# CHECK: jgne 0x1001128 +# CHECK: 0000000001001134 : +# CHECK: bprp 5, 0x100113a, 0x1001134 +# CHECK: bprp 6, 0x1001134, 0x100113a +# CHECK: 0000000001001140 : +# CHECK: bprp 5, 0x1001140, 0x1001146 +# CHECK: bprp 6, 0x1001146, 0x1001140 + +# RUN: ld.lld %t.o --defsym foo16=pc16dbl+0xfffe --defsym bar16=pc16dbl+4-0x10000 --defsym foo32=pc32dbl+0xfffffffe --defsym bar32=pc32dbl+6-0x100000000 --defsym foo12=pc12dbl+0xffe --defsym bar12=pc12dbl+6-0x1000 --defsym foo24=pc24dbl+0xfffffe --defsym bar24=pc24dbl+6-0x1000000 -o %t.limits +# RUN: llvm-objdump --no-show-raw-insn --mcpu=z13 -d %t.limits | FileCheck %s --check-prefix=LIMITS +# LIMITS: je 0x101111e +# LIMITS-NEXT: jne 0xff1124 +# LIMITS: jge 0x101001126 +# LIMITS-NEXT: jgne 0xffffffff0100112e +# LIMITS: bprp 5, 0x1002132, 0x1001134 +# LIMITS-NEXT: bprp 6, 0x100013a, 0x100113a +# LIMITS: bprp 5, 0x1001140, 0x200113e +# LIMITS-NEXT: bprp 6, 0x1001146, 0x1146 + +# RUN: not ld.lld %t.o --defsym foo16=pc16dbl+0x10000 --defsym bar16=pc16dbl+4-0x10002 --defsym foo32=pc32dbl+0x100000000 --defsym bar32=pc32dbl+6-0x100000002 --defsym foo12=pc12dbl+0x1000 --defsym 
bar12=pc12dbl+6-0x1002 --defsym foo24=pc24dbl+0x1000000 --defsym bar24=pc24dbl+6-0x1000002 -o /dev/null 2>&1 | FileCheck -DFILE=%t.o --check-prefix=ERROR-RANGE %s +# ERROR-RANGE: error: [[FILE]]:(.text+0x2): relocation R_390_PC16DBL out of range: 65536 is not in [-65536, 65535]; references 'foo16' +# ERROR-RANGE: error: [[FILE]]:(.text+0x6): relocation R_390_PC16DBL out of range: -65538 is not in [-65536, 65535]; references 'bar16' +# ERROR-RANGE: error: [[FILE]]:(.text+0xa): relocation R_390_PC32DBL out of range: 4294967296 is not in [-4294967296, 4294967295]; references 'foo32' +# ERROR-RANGE: error: [[FILE]]:(.text+0x10): relocation R_390_PC32DBL out of range: -4294967298 is not in [-4294967296, 4294967295]; references 'bar32' +# ERROR-RANGE: error: [[FILE]]:(.text+0x15): relocation R_390_PC12DBL out of range: 4096 is not in [-4096, 4095]; references 'foo12' +# ERROR-RANGE: error: [[FILE]]:(.text+0x1b): relocation R_390_PC12DBL out of range: -4098 is not in [-4096, 4095]; references 'bar12' +# ERROR-RANGE: error: [[FILE]]:(.text+0x23): relocation R_390_PC24DBL out of range: 16777216 is not in [-16777216, 16777215]; references 'foo24' +# ERROR-RANGE: error: [[FILE]]:(.text+0x29): relocation R_390_PC24DBL out of range: -16777218 is not in [-16777216, 16777215]; references 'bar24' + +# RUN: not ld.lld %t.o --defsym foo16=pc16dbl+1 --defsym bar16=pc16dbl-1 --defsym foo32=pc32dbl+1 --defsym bar32=pc32dbl-1 --defsym foo12=pc12dbl+1 --defsym bar12=pc12dbl-1 --defsym foo24=pc24dbl+1 --defsym bar24=pc24dbl-1 -o /dev/null 2>&1 | FileCheck -DFILE=%t.o --check-prefix=ERROR-ALIGN %s +# ERROR-ALIGN: error: [[FILE]]:(.text+0x2): improper alignment for relocation R_390_PC16DBL: 0x1 is not aligned to 2 bytes +# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x6): improper alignment for relocation R_390_PC16DBL: 0xFFFFFFFFFFFFFFFB is not aligned to 2 bytes +# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0xa): improper alignment for relocation R_390_PC32DBL: 0x1 is not aligned to 2 bytes +# 
ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x10): improper alignment for relocation R_390_PC32DBL: 0xFFFFFFFFFFFFFFF9 is not aligned to 2 bytes +# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x15): improper alignment for relocation R_390_PC12DBL: 0x1 is not aligned to 2 bytes +# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x1b): improper alignment for relocation R_390_PC12DBL: 0xFFFFFFFFFFFFFFF9 is not aligned to 2 bytes +# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x23): improper alignment for relocation R_390_PC24DBL: 0x1 is not aligned to 2 bytes +# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x29): improper alignment for relocation R_390_PC24DBL: 0xFFFFFFFFFFFFFFF9 is not aligned to 2 bytes + +.global _start +.global pc16dbl +.global pc32dbl +.global pc12dbl +.global pc24dbl +_start: +pc16dbl: + je foo16 + jne bar16 +pc32dbl: + jge foo32 + jgne bar32 +pc12dbl: + bprp 5,foo12,0 + bprp 6,bar12,0 +pc24dbl: + bprp 5,0,foo24 + bprp 6,0,bar24 diff --git a/lld/test/ELF/systemz-tls-gd.s b/lld/test/ELF/systemz-tls-gd.s new file mode 100644 index 00000000000000..3976f55a6ae39e --- /dev/null +++ b/lld/test/ELF/systemz-tls-gd.s @@ -0,0 +1,142 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o +# RUN: echo '.tbss; .globl b, c; b: .zero 4; c:' | llvm-mc -filetype=obj -triple=s390x-unknown-linux - -o %t1.o +# RUN: ld.lld -shared -soname=t1.so %t1.o -o %t1.so + +# RUN: ld.lld -shared %t.o %t1.o -o %t.so +# RUN: llvm-readelf -r %t.so | FileCheck --check-prefix=GD-REL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t.so | FileCheck --check-prefix=GD %s +# RUN: llvm-objdump --section .data.rel.ro --full-contents %t.so | FileCheck --check-prefix=GD-DATA %s + +# RUN: ld.lld %t.o %t1.o -o %t.le +# RUN: llvm-readelf -r %t.le | FileCheck --check-prefix=NOREL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t.le | FileCheck --check-prefix=LE %s +# RUN: llvm-objdump --section .data.rel.ro --full-contents %t.le | FileCheck --check-prefix=LE-DATA %s + +# RUN: 
ld.lld %t.o %t1.so -o %t.ie +# RUN: llvm-readelf -r %t.ie | FileCheck --check-prefix=IE-REL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t.ie | FileCheck --check-prefix=IE %s +# RUN: llvm-objdump --section .data.rel.ro --full-contents %t.ie | FileCheck --check-prefix=IE-DATA %s + +# GD-REL: Relocation section '.rela.dyn' at offset {{.*}} contains 6 entries: +# GD-REL: 0000000000002570 0000000200000036 R_390_TLS_DTPMOD 0000000000000008 a + 0 +# GD-REL-NEXT: 0000000000002578 0000000200000037 R_390_TLS_DTPOFF 0000000000000008 a + 0 +# GD-REL-NEXT: 0000000000002580 0000000300000036 R_390_TLS_DTPMOD 000000000000000c b + 0 +# GD-REL-NEXT: 0000000000002588 0000000300000037 R_390_TLS_DTPOFF 000000000000000c b + 0 +# GD-REL-NEXT: 0000000000002590 0000000400000036 R_390_TLS_DTPMOD 0000000000000010 c + 0 +# GD-REL-NEXT: 0000000000002598 0000000400000037 R_390_TLS_DTPOFF 0000000000000010 c + 0 + +## _GLOBAL_OFFSET_TABLE is at 0x2558 +# GD: larl %r12, 0x2558 + +## GOT offset of the TLS module ID / offset pair for a is at 0x2460 +# GD-NEXT: lgrl %r2, 0x2460 +# GD-NEXT: brasl %r14, 0x1440 +# GD-NEXT: lgf %r2, 0(%r2,%r7) + +## GOT offset of the TLS module ID / offset pair for b is at 0x2468 +# GD-NEXT: lgrl %r2, 0x2468 +# GD-NEXT: brasl %r14, 0x1440 +# GD-NEXT: lgf %r2, 0(%r2,%r7) + +## GOT offset of the TLS module ID / offset pair for c is at 0x2470 +# GD-NEXT: lgrl %r2, 0x2470 +# GD-NEXT: brasl %r14, 0x1440 +# GD-NEXT: lgf %r2, 0(%r2,%r7) + +## Constant pool holding GOT offsets of TLS module ID / offset pairs: +# a: 0x2570 / 0x18 +# b: 0x2580 / 0x28 +# c: 0x2590 / 0x38 +# GD-DATA: 2460 00000000 00000018 00000000 00000028 +# GD-DATA-NEXT: 2470 00000000 00000038 + +# NOREL: no relocations + +## _GLOBAL_OFFSET_TABLE is at 0x1002230 +# LE: larl %r12, 0x1002230 + +## TP offset of a is at 0x1002218 +# LE-NEXT: lgrl %r2, 0x1002218 +# LE-NEXT: brcl 0, +# LE-NEXT: lgf %r2, 0(%r2,%r7) + +## TP offset of b is at 0x1002220 +# LE-NEXT: lgrl %r2, 0x1002220 +# LE-NEXT: brcl 0, +# LE-NEXT: 
lgf %r2, 0(%r2,%r7) + +## TP offset of c is at 0x1002228 +# LE-NEXT: lgrl %r2, 0x1002228 +# LE-NEXT: brcl 0, +# LE-NEXT: lgf %r2, 0(%r2,%r7) + +## TP offsets +# a: -8 +# b: -4 +# c: 0 +# LE-DATA: 1002218 ffffffff fffffff8 ffffffff fffffffc +# LE-DATA-NEXT: 1002228 00000000 00000000 + + +# IE-REL: Relocation section '.rela.dyn' at offset {{.*}} contains 2 entries: +# IE-REL: 0000000001002430 0000000200000038 R_390_TLS_TPOFF 0000000000000000 b + 0 +# IE-REL-NEXT: 0000000001002438 0000000300000038 R_390_TLS_TPOFF 0000000000000000 c + 0 + +## _GLOBAL_OFFSET_TABLE is at 0x1002418 +# IE: larl %r12, 0x1002418 + +## TP offset of a is at 0x1002340 +# IE-NEXT: lgrl %r2, 0x1002340 +# IE-NEXT: brcl 0, +# IE-NEXT: lgf %r2, 0(%r2,%r7) + +## GOT offset of the TP offset for b is at 0x1002348 +# IE-NEXT: lgrl %r2, 0x1002348 +# IE-NEXT: lg %r2, 0(%r2,%r12) +# IE-NEXT: lgf %r2, 0(%r2,%r7) + +## GOT offset of the TP offset for c is at 0x1002350 +# IE-NEXT: lgrl %r2, 0x1002350 +# IE-NEXT: lg %r2, 0(%r2,%r12) +# IE-NEXT: lgf %r2, 0(%r2,%r7) + +## TP offsets (a) / GOT offset of TP offsets (b, c) +# a: -4 +# b: 0x1002430 / 0x18 +# c: 0x1002438 / 0x20 +# IE-DATA: 1002340 ffffffff fffffffc 00000000 00000018 +# IE-DATA-NEXT: 1002350 00000000 00000020 + + +ear %r7,%a0 +sllg %r7,%r1,32 +ear %r7,%a1 +larl %r12,_GLOBAL_OFFSET_TABLE_ + +lgrl %r2,.LC0 +brasl %r14,__tls_get_offset@PLT:tls_gdcall:a +lgf %r2,0(%r2,%r7) + +lgrl %r2,.LC1 +brasl %r14,__tls_get_offset@PLT:tls_gdcall:b +lgf %r2,0(%r2,%r7) + +lgrl %r2,.LC2 +brasl %r14,__tls_get_offset@PLT:tls_gdcall:c +lgf %r2,0(%r2,%r7) + + .section .data.rel.ro,"aw" + .align 8 +.LC0: + .quad a@TLSGD +.LC1: + .quad b@TLSGD +.LC2: + .quad c@TLSGD + + .section .tbss + .globl a + .zero 8 +a: + .zero 4 diff --git a/lld/test/ELF/systemz-tls-ie.s b/lld/test/ELF/systemz-tls-ie.s new file mode 100644 index 00000000000000..85e2f24cb61f62 --- /dev/null +++ b/lld/test/ELF/systemz-tls-ie.s @@ -0,0 +1,121 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj 
-triple=s390x-unknown-linux %s -o %t.o + +# RUN: ld.lld -shared %t.o -o %t.so +# RUN: llvm-readelf -r %t.so | FileCheck --check-prefix=IE-REL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t.so | FileCheck --check-prefix=IE %s +# RUN: llvm-objdump --section .data --full-contents %t.so | FileCheck --check-prefix=IE-DATA %s + +# RUN: ld.lld %t.o -o %t +# RUN: llvm-readelf -r %t | FileCheck --check-prefix=NOREL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=LE %s +# RUN: llvm-objdump --section .data --full-contents %t | FileCheck --check-prefix=LE-DATA %s +# RUN: llvm-objdump --section .got --full-contents %t | FileCheck --check-prefix=LE-GOT %s + +## With -pie we still have the R_390_RELATIVE for the data element, but all GOT +## entries should be fully resolved without any remaining R_390_TLS_TPOFF. +# RUN: ld.lld -pie %t.o -o %t.pie +# RUN: llvm-readelf -r %t.pie | FileCheck --check-prefix=PIE-REL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t.pie | FileCheck --check-prefix=PIE %s +# RUN: llvm-objdump --section .data --full-contents %t.pie | FileCheck --check-prefix=PIE-DATA %s +# RUN: llvm-objdump --section .got --full-contents %t.pie | FileCheck --check-prefix=PIE-GOT %s + +# IE-REL: Relocation section '.rela.dyn' at offset {{.*}} contains 4 entries: +# IE-REL: 0000000000003478 000000000000000c R_390_RELATIVE 2460 +# IE-REL: 0000000000002460 0000000100000038 R_390_TLS_TPOFF 0000000000000008 a + 0 +# IE-REL: 0000000000002468 0000000200000038 R_390_TLS_TPOFF 000000000000000c b + 0 +# IE-REL: 0000000000002470 0000000300000038 R_390_TLS_TPOFF 0000000000000010 c + 0 + +## TP offset for a is at 0x2460 +# IE: lgrl %r1, 0x2460 +# IE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for b is at 0x2468 +# IE-NEXT: lgrl %r1, 0x2468 +# IE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for c is at 0x2470 +# IE-NEXT: lgrl %r1, 0x2470 +# IE-NEXT: lgf %r1, 0(%r1,%r7) + +## Data element: TP offset for a is at 0x2460 (relocated via R_390_RELATIVE above) +# 
IE-DATA: 3478 00000000 00000000 + +# NOREL: no relocations + +## TP offset for a is at 0x1002250 +# LE: lgrl %r1, 0x1002250 +# LE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for b is at 0x1002258 +# LE-NEXT: lgrl %r1, 0x1002258 +# LE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for c is at 0x1002260 +# LE-NEXT: lgrl %r1, 0x1002260 +# LE-NEXT: lgf %r1, 0(%r1,%r7) + +## Data element: TP offset for a is at 0x1002250 +# LE-DATA: 00000000 01002250 + +## TP offsets in GOT: +# a: -8 +# b: -4 +# c: 0 +# LE-GOT: 1002238 00000000 00000000 00000000 00000000 +# LE-GOT: 1002248 00000000 00000000 ffffffff fffffff8 +# LE-GOT: 1002258 ffffffff fffffffc 00000000 00000000 + +# PIE-REL: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries: +# PIE-REL: 00000000000033d0 000000000000000c R_390_RELATIVE 23b8 + +## TP offset for a is at 0x23b8 +# PIE: lgrl %r1, 0x23b8 +# PIE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for b is at 0x23c0 +# PIE-NEXT: lgrl %r1, 0x23c0 +# PIE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for c is at 0x23c8 +# PIE-NEXT: lgrl %r1, 0x23c8 +# PIE-NEXT: lgf %r1, 0(%r1,%r7) + +## Data element: TP offset for a is at 0x23b8 (relocated via R_390_RELATIVE above) +# PIE-DATA: 33d0 00000000 00000000 + +## TP offsets in GOT: +# a: -8 +# b: -4 +# c: 0 +# PIE-GOT: 23a0 00000000 000022d0 00000000 00000000 +# PIE-GOT: 23b0 00000000 00000000 ffffffff fffffff8 +# PIE-GOT: 23c0 ffffffff fffffffc 00000000 00000000 + +ear %r7,%a0 +sllg %r7,%r1,32 +ear %r7,%a1 + +lgrl %r1, a@indntpoff +lgf %r1,0(%r1,%r7) + +lgrl %r1, b@indntpoff +lgf %r1,0(%r1,%r7) + +lgrl %r1, c@indntpoff +lgf %r1,0(%r1,%r7) + + .data + .reloc .,R_390_TLS_IE64,a + .space 8 + + .section .tbss + .globl a + .globl b + .globl c + .zero 8 +a: + .zero 4 +b: + .zero 4 +c: diff --git a/lld/test/ELF/systemz-tls-ld.s b/lld/test/ELF/systemz-tls-ld.s new file mode 100644 index 00000000000000..2cb36d7294f2b0 --- /dev/null +++ b/lld/test/ELF/systemz-tls-ld.s @@ -0,0 +1,114 @@ +# REQUIRES: systemz +# RUN: llvm-mc 
-filetype=obj -triple=s390x-unknown-linux %s -o %t.o + +# RUN: ld.lld -shared %t.o -o %t.so +# RUN: llvm-readelf -r %t.so | FileCheck --check-prefix=LD-REL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t.so | FileCheck --check-prefix=LD %s +# RUN: llvm-objdump --section .data.rel.ro --full-contents %t.so | FileCheck --check-prefix=LD-DATA %s + +# RUN: ld.lld %t.o -o %t +# RUN: llvm-readelf -r %t | FileCheck --check-prefix=NOREL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=LE %s +# RUN: llvm-objdump --section .data.rel.ro --full-contents %t | FileCheck --check-prefix=LE-DATA %s + +# LD-REL: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries: +# LD-REL: 00000000000024f8 0000000000000036 R_390_TLS_DTPMOD 0 + +## _GLOBAL_OFFSET_TABLE is at 0x24e0 +# LD: larl %r12, 0x24e0 + +## GOT offset of the LDM TLS module ID is at 0x23e0 +# LD-NEXT: lgrl %r2, 0x23e0 +# LD-NEXT: brasl %r14, 0x13c0 +# LD-NEXT: la %r2, 0(%r2,%r7) + +## DTP offset for a is at 0x23e8 +# LD-NEXT: lgrl %r1, 0x23e8 +# LD-NEXT: lgf %r1, 0(%r1,%r2) + +## DTP offset for b is at 0x23f0 +# LD-NEXT: lgrl %r1, 0x23f0 +# LD-NEXT: lgf %r1, 0(%r1,%r2) + +## DTP offset for c is at 0x23f8 +# LD-NEXT: lgrl %r1, 0x23f8 +# LD-NEXT: lgf %r1, 0(%r1,%r2) + +## Constant pool holding GOT offsets of TLS module ID and DTP offsets: +# TLS module ID: 0x24f8 / 0x18 +# a: 8 +# b: 12 +# c: 16 +# LD-DATA: 23e0 00000000 00000018 00000000 00000008 +# LD-DATA: 23f0 00000000 0000000c 00000000 00000010 + +# NOREL: no relocations + +## _GLOBAL_OFFSET_TABLE is at 0x1002230 +# LE: larl %r12, 0x1002230 + +## GOT offset of the LDM TLS module ID is at 0x1002210 +# LE-NEXT: lgrl %r2, 0x1002210 +# LE-NEXT: brcl 0, +# LE-NEXT: la %r2, 0(%r2,%r7) + +## TP offset for a is at 0x1002218 +# LE-NEXT: lgrl %r1, 0x1002218 +# LE-NEXT: lgf %r1, 0(%r1,%r2) + +## TP offset for b is at 0x1002220 +# LE-NEXT: lgrl %r1, 0x1002220 +# LE-NEXT: lgf %r1, 0(%r1,%r2) + +## TP offset for c is at 0x1002228 +# LE-NEXT: lgrl 
%r1, 0x1002228 +# LE-NEXT: lgf %r1, 0(%r1,%r2) + +## zeroed LDM / TP offsets: +# LDM TLS: 0 +# a: -8 +# b: -4 +# c: 0 +# LE-DATA: 1002210 00000000 00000000 ffffffff fffffff8 +# LE-DATA: 1002220 ffffffff fffffffc 00000000 00000000 + + +ear %r7,%a0 +sllg %r7,%r1,32 +ear %r7,%a1 +larl %r12,_GLOBAL_OFFSET_TABLE_ + +lgrl %r2,.LC0 +brasl %r14,__tls_get_offset@PLT:tls_ldcall:a +la %r2,0(%r2,%r7) + +lgrl %r1, .LC1 +lgf %r1,0(%r1,%r2) + +lgrl %r1, .LC2 +lgf %r1,0(%r1,%r2) + +lgrl %r1, .LC3 +lgf %r1,0(%r1,%r2) + + .section .data.rel.ro,"aw" + .align 8 +.LC0: + .quad a@TLSLDM +.LC1: + .quad a@DTPOFF +.LC2: + .quad b@DTPOFF +.LC3: + .quad c@DTPOFF + + .section .tbss + .globl a + .globl b + .globl c + .zero 8 +a: + .zero 4 +b: + .zero 4 +c: diff --git a/lld/test/ELF/systemz-tls-le.s b/lld/test/ELF/systemz-tls-le.s new file mode 100644 index 00000000000000..9e41fc768da391 --- /dev/null +++ b/lld/test/ELF/systemz-tls-le.s @@ -0,0 +1,61 @@ +# REQUIRES: systemz +# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o + +# RUN: ld.lld %t.o -o %t +# RUN: llvm-readelf -r %t | FileCheck --check-prefix=NOREL %s +# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=LE %s +# RUN: llvm-objdump --section .data.rel.ro --full-contents %t | FileCheck --check-prefix=LE-DATA %s + +# NOREL: no relocations + +## TP offset for a is at 0x1002200 +# LE: lgrl %r1, 0x1002200 +# LE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for b is at 0x1002208 +# LE-NEXT: lgrl %r1, 0x1002208 +# LE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offset for c is at 0x1002210 +# LE-NEXT: lgrl %r1, 0x1002210 +# LE-NEXT: lgf %r1, 0(%r1,%r7) + +## TP offsets: +# a: -8 +# b: -4 +# c: 0 +# LE-DATA: 1002200 ffffffff fffffff8 ffffffff fffffffc +# LE-DATA: 1002210 00000000 00000000 + +ear %r7,%a0 +sllg %r7,%r1,32 +ear %r7,%a1 + +lgrl %r1, .LC0 +lgf %r1,0(%r1,%r7) + +lgrl %r1, .LC1 +lgf %r1,0(%r1,%r7) + +lgrl %r1, .LC2 +lgf %r1,0(%r1,%r7) + + .section .data.rel.ro,"aw" + .align 8 +.LC0: + .quad a@ntpoff +.LC1: + 
.quad b@ntpoff +.LC2: + .quad c@ntpoff + + .section .tbss + .globl a + .globl b + .globl c + .zero 8 +a: + .zero 4 +b: + .zero 4 +c: diff --git a/lld/test/ELF/warn-backrefs.s b/lld/test/ELF/warn-backrefs.s index 1e5c14ed052e66..453017eb1c8ec3 100644 --- a/lld/test/ELF/warn-backrefs.s +++ b/lld/test/ELF/warn-backrefs.s @@ -100,6 +100,10 @@ ## -u does not make a backward reference. # RUN: ld.lld --fatal-warnings --warn-backrefs -u foo %t2.a %t1.o -o /dev/null +## --defsym does not make a backward reference, but it does not suppress the warning due to another file. +# RUN: ld.lld --fatal-warnings --warn-backrefs --defsym=x=foo -e 0 %t2.a -o /dev/null +# RUN: ld.lld --warn-backrefs --defsym=x=foo %t2.a %t1.o -o /dev/null 2>&1 | FileCheck %s + # RUN: not ld.lld --warn-backrefs-exclude='[' 2>&1 | FileCheck --check-prefix=INVALID %s # INVALID: error: --warn-backrefs-exclude: invalid glob pattern, unmatched '[': [ diff --git a/lld/test/ELF/x86-64-gotpc-relax-too-far.s b/lld/test/ELF/x86-64-gotpc-relax-too-far.s index 74aa6d8f65a0d8..ba41faab67de5c 100644 --- a/lld/test/ELF/x86-64-gotpc-relax-too-far.s +++ b/lld/test/ELF/x86-64-gotpc-relax-too-far.s @@ -5,7 +5,10 @@ # RUN: llvm-objdump --no-print-imm-hex -d %t/bin | FileCheck --check-prefix=DISASM %s # RUN: llvm-readelf -S %t/bin | FileCheck --check-prefixes=GOT %s # RUN: ld.lld -T %t/lds2 %t/a.o -o %t/bin2 -# RUN: llvm-readelf -S %t/bin2 | FileCheck --check-prefixes=UNNECESSARY-GOT %s +# RUN: llvm-objdump --no-print-imm-hex -d %t/bin2 | FileCheck --check-prefix=DISASM %s +# RUN: llvm-readelf -S %t/bin2 | FileCheck --check-prefixes=GOT %s +# RUN: ld.lld -T %t/lds3 %t/a.o -o %t/bin3 +# RUN: llvm-readelf -S %t/bin3 | FileCheck --check-prefixes=UNNECESSARY-GOT %s # DISASM: <_foo>: # DISASM-NEXT: movl 2097146(%rip), %eax @@ -47,6 +50,13 @@ SECTIONS { data 0x80200000 : { *(data) } } #--- lds2 +SECTIONS { + .text.foo 0x100000 : { *(.text.foo) } + .text 0x1ff000 : { . = . 
+ 0x1000 ; *(.text) } + .got 0x300000 : { *(.got) } + data 0x80200000 : { *(data) } +} +#--- lds3 SECTIONS { .text.foo 0x100000 : { *(.text.foo) } .text 0x200000 : { *(.text) } diff --git a/lld/test/MachO/invalid/invalid-lto-object-path.ll b/lld/test/MachO/invalid/invalid-lto-object-path.ll index 75c6a97e446fb2..c862538d592ce8 100644 --- a/lld/test/MachO/invalid/invalid-lto-object-path.ll +++ b/lld/test/MachO/invalid/invalid-lto-object-path.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ;; Creating read-only directories with `chmod 400` isn't supported on Windows ; UNSUPPORTED: system-windows diff --git a/lld/test/MachO/thinlto-emit-imports.ll b/lld/test/MachO/thinlto-emit-imports.ll index 47a612bd0a7b56..88f766f59c8877 100644 --- a/lld/test/MachO/thinlto-emit-imports.ll +++ b/lld/test/MachO/thinlto-emit-imports.ll @@ -1,4 +1,4 @@ -; REQUIRES: x86 +; REQUIRES: x86, non-root-user ; RUN: rm -rf %t; split-file %s %t ; Generate summary sections and test lld handling. diff --git a/lld/test/MinGW/driver.test b/lld/test/MinGW/driver.test index 559a32bfa242f8..057de2a22f6a0c 100644 --- a/lld/test/MinGW/driver.test +++ b/lld/test/MinGW/driver.test @@ -409,6 +409,13 @@ LTO_OPTS: -mllvm:-mcpu=x86-64 -opt:lldlto=2 -dwodir:foo -lto-cs-profile-generate RUN: ld.lld -### foo.o -m i386pep --lto-O2 --lto-CGO1 --lto-cs-profile-generate --lto-cs-profile-file=foo 2>&1 | FileCheck -check-prefix=LTO_OPTS2 %s LTO_OPTS2:-opt:lldlto=2 -opt:lldltocgo=1 -lto-cs-profile-generate -lto-cs-profile-file:foo +RUN: ld.lld -### foo.o -m i386pe -plugin-opt=emit-asm 2>&1 | FileCheck -check-prefix=LTO_EMIT_ASM %s +RUN: ld.lld -### foo.o -m i386pe --lto-emit-asm 2>&1 | FileCheck -check-prefix=LTO_EMIT_ASM %s +LTO_EMIT_ASM: -lldemit:asm + +RUN: ld.lld -### foo.o -m i386pe -plugin-opt=emit-llvm 2>&1 | FileCheck -check-prefix=LTO_EMIT_LLVM %s +LTO_EMIT_LLVM: -lldemit:llvm + Test GCC specific LTO options that GCC passes unconditionally, that we ignore. 
RUN: ld.lld -### foo.o -m i386pep -plugin /usr/lib/gcc/x86_64-w64-mingw32/10-posix/liblto_plugin.so -plugin-opt=/usr/lib/gcc/x86_64-w64-mingw32/10-posix/lto-wrapper -plugin-opt=-fresolution=/tmp/ccM9d4fP.res -plugin-opt=-pass-through=-lmingw32 2> /dev/null diff --git a/lld/test/lit.cfg.py b/lld/test/lit.cfg.py index b3e07f1f823cc4..d309c2ad4ee284 100644 --- a/lld/test/lit.cfg.py +++ b/lld/test/lit.cfg.py @@ -83,6 +83,7 @@ "PowerPC": "ppc", "RISCV": "riscv", "Sparc": "sparc", + "SystemZ": "systemz", "WebAssembly": "wasm", "X86": "x86", }, diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 1d230004e6c34e..f82be164ac9c48 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -19,13 +19,13 @@ if(NOT DEFINED LLVM_VERSION_MAJOR) set(LLVM_VERSION_MAJOR 18) endif() if(NOT DEFINED LLVM_VERSION_MINOR) - set(LLVM_VERSION_MINOR 0) + set(LLVM_VERSION_MINOR 1) endif() if(NOT DEFINED LLVM_VERSION_PATCH) - set(LLVM_VERSION_PATCH 0) + set(LLVM_VERSION_PATCH 5) endif() if(NOT DEFINED LLVM_VERSION_SUFFIX) - set(LLVM_VERSION_SUFFIX git) + set(LLVM_VERSION_SUFFIX) endif() if (NOT PACKAGE_VERSION) @@ -35,7 +35,7 @@ endif() if(NOT DEFINED LLVM_SHLIB_SYMBOL_VERSION) # "Symbol version prefix for libLLVM.so" - set(LLVM_SHLIB_SYMBOL_VERSION "LLVM_${LLVM_VERSION_MAJOR}") + set(LLVM_SHLIB_SYMBOL_VERSION "LLVM_${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}") endif() if ((CMAKE_GENERATOR MATCHES "Visual Studio") AND (MSVC_TOOLSET_VERSION LESS 142) AND (CMAKE_GENERATOR_TOOLSET STREQUAL "")) diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 5e989618552824..ceec15b611140d 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -108,7 +108,7 @@ function(add_llvm_symbol_exports target_name export_file) COMMAND "${Python3_EXECUTABLE}" "-c" "import sys; \ lines = [' ' + l.rstrip() for l in sys.stdin] + [' local: *;']; \ - print('LLVM_${LLVM_VERSION_MAJOR} {'); \ + print('LLVM_${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR} 
{'); \ print(' global:') if len(lines) > 1 else None; \ print(';\\n'.join(lines) + '\\n};')" < ${export_file} > ${native_export_file} @@ -646,9 +646,9 @@ function(llvm_add_library name) if(UNIX AND NOT APPLE AND NOT ARG_SONAME) set_target_properties(${name} PROPERTIES - # Since 4.0.0, the ABI version is indicated by the major version - SOVERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_SUFFIX} - VERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_SUFFIX}) + # Since 18.1.0, the ABI version is indicated by the major and minor version. + SOVERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}${LLVM_VERSION_SUFFIX} + VERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}${LLVM_VERSION_SUFFIX}) endif() endif() @@ -2074,7 +2074,7 @@ function(add_lit_testsuites project directory) endfunction() function(llvm_install_library_symlink name dest type) - cmake_parse_arguments(ARG "" "COMPONENT" "" ${ARGN}) + cmake_parse_arguments(ARG "FULL_DEST" "COMPONENT" "" ${ARGN}) foreach(path ${CMAKE_MODULE_PATH}) if(EXISTS ${path}/LLVMInstallSymlink.cmake) set(INSTALL_SYMLINK ${path}/LLVMInstallSymlink.cmake) @@ -2088,7 +2088,11 @@ function(llvm_install_library_symlink name dest type) endif() set(full_name ${CMAKE_${type}_LIBRARY_PREFIX}${name}${CMAKE_${type}_LIBRARY_SUFFIX}) - set(full_dest ${CMAKE_${type}_LIBRARY_PREFIX}${dest}${CMAKE_${type}_LIBRARY_SUFFIX}) + if (ARG_FULL_DEST) + set(full_dest ${dest}) + else() + set(full_dest ${CMAKE_${type}_LIBRARY_PREFIX}${dest}${CMAKE_${type}_LIBRARY_SUFFIX}) + endif() if(LLVM_USE_SYMLINKS) set(LLVM_LINK_OR_COPY create_symlink) diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in index 74e1c6bf52e230..770a9caea322e6 100644 --- a/llvm/cmake/modules/LLVMConfig.cmake.in +++ b/llvm/cmake/modules/LLVMConfig.cmake.in @@ -90,6 +90,11 @@ if(LLVM_ENABLE_CURL) find_package(CURL) endif() +set(LLVM_ENABLE_HTTPLIB @LLVM_ENABLE_HTTPLIB@) +if(LLVM_ENABLE_HTTPLIB) + find_package(httplib) +endif() + set(LLVM_WITH_Z3 @LLVM_WITH_Z3@) 
set(LLVM_ENABLE_DIA_SDK @LLVM_ENABLE_DIA_SDK@) diff --git a/llvm/docs/AdvancedBuilds.rst b/llvm/docs/AdvancedBuilds.rst index 960b19fa5317f3..ee178dd3772c4b 100644 --- a/llvm/docs/AdvancedBuilds.rst +++ b/llvm/docs/AdvancedBuilds.rst @@ -145,6 +145,29 @@ that also enables ThinTLO, use the following command: -DPGO_INSTRUMENT_LTO=Thin \ /llvm +By default, clang will generate profile data by compiling a simple +hello world program. You can also tell clang use an external +project for generating profile data that may be a better fit for your +use case. The project you specify must either be a lit test suite +(use the CLANG_PGO_TRAINING_DATA option) or a CMake project (use the +CLANG_PERF_TRAINING_DATA_SOURCE_DIR option). + +For example, If you wanted to use the +`LLVM Test Suite `_ to generate +profile data you would use the following command: + +.. code-block:: console + + $ cmake -G Ninja -C /clang/cmake/caches/PGO.cmake \ + -DBOOTSTRAP_CLANG_PGO_TRAINING_DATA_SOURCE_DIR= \ + -DBOOTSTRAP_CLANG_PGO_TRAINING_DEPS=runtimes + +The BOOTSTRAP\_ prefixes tells CMake to pass the variables on to the instrumented +stage two build. And the CLANG_PGO_TRAINING_DEPS option let's you specify +additional build targets to build before building the external project. The +LLVM Test Suite requires compiler-rt to build, so we need to add the +`runtimes` target as a dependency. + After configuration, building the stage2-instrumented-generate-profdata target will automatically build the stage1 compiler, build the instrumented compiler with the stage1 compiler, and then run the instrumented compiler against the @@ -172,12 +195,12 @@ You can feed that file into the LLVM_PROFDATA_FILE option when you build your optimized compiler. It may be necessary to build additional targets before running perf training, such as -builtins and runtime libraries. You can use the :code:`CLANG_PERF_TRAINING_DEPS` CMake +builtins and runtime libraries. 
You can use the :code:`CLANG_PGO_TRAINING_DEPS` CMake variable for that purpose: .. code-block:: cmake - set(CLANG_PERF_TRAINING_DEPS builtins runtimes CACHE STRING "") + set(CLANG_PGO_TRAINING_DEPS builtins runtimes CACHE STRING "") The PGO cache has a slightly different stage naming scheme than other multi-stage builds. It generates three stages: stage1, stage2-instrumented, and diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst index 6e13cd94b92fda..01a96f2f4f770e 100644 --- a/llvm/docs/CommandGuide/llvm-objcopy.rst +++ b/llvm/docs/CommandGuide/llvm-objcopy.rst @@ -538,6 +538,10 @@ options. For GNU :program:`objcopy` compatibility, the values are all bfdnames. - `elf64-tradlittlemips` - `elf32-sparc` - `elf32-sparcel` +- `elf32-hexagon` +- `elf32-loongarch` +- `elf64-loongarch` +- `elf64-s390` Additionally, all targets except `binary` and `ihex` can have `-freebsd` as a suffix. diff --git a/llvm/docs/CommandGuide/llvm-readelf.rst b/llvm/docs/CommandGuide/llvm-readelf.rst index 6ee4a5dfb15917..675628fdda45ec 100644 --- a/llvm/docs/CommandGuide/llvm-readelf.rst +++ b/llvm/docs/CommandGuide/llvm-readelf.rst @@ -38,6 +38,11 @@ OPTIONS Display the contents of the basic block address map section(s), which contain the address of each function, along with the relative offset of each basic block. +.. option:: --decompress, -z + + Dump decompressed section content when used with ``-x`` or ``-p``. + If the section(s) are not compressed, they are displayed as is. + .. option:: --demangle, -C Display demangled symbol names in the output. diff --git a/llvm/docs/CommandGuide/llvm-readobj.rst b/llvm/docs/CommandGuide/llvm-readobj.rst index cb9232ef5e560a..6d78a038723445 100644 --- a/llvm/docs/CommandGuide/llvm-readobj.rst +++ b/llvm/docs/CommandGuide/llvm-readobj.rst @@ -56,6 +56,11 @@ file formats. Display the address-significance table. +.. 
option:: --decompress, -z + + Dump decompressed section content when used with ``-x`` or ``-p``. + If the section(s) are not compressed, they are displayed as is. + .. option:: --expand-relocs When used with :option:`--relocs`, display each relocation in an expanded diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 7a7ddc59ba985d..74b0439da7fc58 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -5499,6 +5499,8 @@ RISC-V: Sparc: +- ``L``: Print the low-order register of a two-register operand. +- ``H``: Print the high-order register of a two-register operand. - ``r``: No effect. SystemZ: diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 6fdc945ad27078..ba292ea39c8a8a 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -130,6 +130,7 @@ on support follow. ``Zicclsm`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Ziccrse`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Zicntr`` (`See Note <#riscv-i2p1-note>`__) + ``Zicond`` Supported ``Zicsr`` (`See Note <#riscv-i2p1-note>`__) ``Zifencei`` (`See Note <#riscv-i2p1-note>`__) ``Zihintntl`` Supported @@ -234,9 +235,6 @@ The primary goal of experimental support is to assist in the process of ratifica ``experimental-zicfilp``, ``experimental-zicfiss`` LLVM implements the `0.4 draft specification `__. -``experimental-zicond`` - LLVM implements the `1.0-rc1 draft specification `__. - ``experimental-ztso`` LLVM implements the `v0.1 proposed specification `__ (see Chapter 25). The mapping from the C/C++ memory model to Ztso has not yet been ratified in any standards document. There are multiple possible mappings, and they are *not* mutually ABI compatible. The mapping LLVM implements is ABI compatible with the default WMO mapping. This mapping may change and there is *explicitly* no ABI stability offered while the extension remains in experimental status. User beware. 
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 7b6a3f10d63777..ff929b0bc5e15b 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -105,6 +105,14 @@ Changes to the AArch64 Backend Armv9.0a has the same features enabled as Armv8.5a, with the exception of crypto. +* Assembler/disassembler support has been added for 2023 architecture + extensions. + +* Support has been added for Stack Clash Protection. During function frame + creation and dynamic stack allocations, the compiler will issue memory + accesses at reguilar intervals so that a guard area at the top of the stack + can't be skipped over. + Changes to the AMDGPU Backend ----------------------------- @@ -156,6 +164,30 @@ Changes to the MIPS Backend Changes to the PowerPC Backend ------------------------------ +* LLJIT's JIT linker now defaults to JITLink on 64-bit ELFv2 targets. +* Initial-exec TLS model is supported on AIX. +* Implemented new resource based scheduling model of POWER7 and POWER8. +* ``frexp`` libcall now references correct symbol name for ``fp128``. +* Optimized materialization of 64-bit immediates, code generation of + ``vec_promote`` and atomics. +* Global constant strings are pooled in the TOC under one entry to reduce the + number of entries in the TOC. +* Added a number of missing Power10 extended mnemonics. +* Added the SCV instruction. +* Fixed register class for the paddi instruction. +* Optimize VPERM and fix code order for swapping vector operands on LE. +* Added various bug fixes and code gen improvements. + +AIX Support/improvements: + +* Support for a non-TOC-based access sequence for the local-exec TLS model (called small local-exec). +* XCOFF toc-data peephole optimization and bug fixes. +* Move less often used __ehinfo TOC entries to the end of the TOC section. 
+* Fixed problems when the AIX libunwind unwinds starting from a signal handler + and the function that raised the signal happens to be a leaf function that + shares the stack frame with its caller or a leaf function that does not store + the stack frame backchain. + Changes to the RISC-V Backend ----------------------------- @@ -181,6 +213,18 @@ Changes to the RISC-V Backend specification. * The Smepmp 1.0 extension is now supported. * ``-mcpu=sifive-p670`` was added. +* Support for the Zicond extension is no longer experimental. + +Changes to the SystemZ Backend +------------------------------ + +* Properly support 16 byte atomic int/fp types and ops. +* Support i128 as legal type in VRs. +* Add an i128 cost model. +* Support building individual functions with backchain using the + __attribute__((target("backchain"))) syntax. +* Add exception handling for XPLINK. +* Add support for llvm-objcopy. Changes to the WebAssembly Backend ---------------------------------- @@ -306,18 +350,44 @@ Changes to the Debug Info Changes to the LLVM tools --------------------------------- -* llvm-symbolizer now treats invalid input as an address for which source +* ``llvm-symbolizer`` now treats invalid input as an address for which source information is not found. -* llvm-readelf now supports ``--extra-sym-info`` (``-X``) to display extra +* Fixed big-endian support in ``llvm-symbolizer``'s DWARF location parser. +* ``llvm-readelf`` now supports ``--extra-sym-info`` (``-X``) to display extra information (section name) when showing symbols. +* ``llvm-readobj``/``llvm-readelf`` now supports ``--decompress``/``-z`` with + string and hex dump for ELF object files. + +* ``llvm-symbolizer`` and ``llvm-addr2line`` now support addresses specified as symbol names. + +* ``llvm-objcopy`` now supports ``--gap-fill`` and ``--pad-to`` options, for + ELF input and binary output files only. +* ``llvm-objcopy`` now supports ``-O elf64-s390`` for SystemZ. 
+ +* Supported parsing XCOFF auxiliary symbols in ``obj2yaml``. +* ``llvm-ranlib`` now supports ``-X`` on AIX to specify the type of object file + ranlib should examine. + +* ``llvm-cxxfilt`` now supports ``--no-params``/``-p`` to skip function + parameters. + +* ``llvm-nm`` now supports ``--export-symbol`` to ignore the import symbol file. * ``llvm-nm`` now supports the ``--line-numbers`` (``-l``) option to use debugging information to print symbols' filenames and line numbers. -* llvm-symbolizer and llvm-addr2line now support addresses specified as symbol names. +* ``llvm-rc`` and ``llvm-windres`` now accept file path references in ``.rc`` files + concatenated from multiple string literals. -* llvm-objcopy now supports ``--gap-fill`` and ``--pad-to`` options, for - ELF input and binary output files only. +* The ``llvm-windres`` option ``--preprocessor`` now resolves its argument + in the ``PATH`` environment variable as expected, and options passed with + ``--preprocessor-arg`` are placed before the input file as they should + be. + +* The ``llvm-windres`` option ``--preprocessor`` has been updated with the + breaking behaviour change from GNU windres from binutils 2.36, where + the whole argument is considered as one path, not considered as a + sequence of tool name and parameters. Changes to LLDB --------------------------------- @@ -359,10 +429,25 @@ Changes to LLDB fields are present, however this is not always possible or entirely accurate. If in doubt, refer to the numerical value. +* On Windows, LLDB can now read the thread names. + Changes to Sanitizers --------------------- * HWASan now defaults to detecting use-after-scope bugs. +* `SpecialCaseList `_ + used by sanitizer ignore lists (e.g. ``*_ignorelist.txt`` in the Clang + resource directory) now uses glob patterns instead of a variant of POSIX + Extended Regular Expression (where ``*`` is translated to ``.*``) by default. 
+ Search for ``|`` to find patterns that may have different meanings now, and + replace ``a|b`` with ``{a,b}``. + +Changes to the Profile Runtime +------------------------------ + +* Public header ``profile/instr_prof_interface.h`` is added to declare four + API functions to fine tune profile collection. + Other Changes ------------- diff --git a/llvm/include/llvm/ADT/iterator_range.h b/llvm/include/llvm/ADT/iterator_range.h index 2dc227935984b1..7d288ea4506ba5 100644 --- a/llvm/include/llvm/ADT/iterator_range.h +++ b/llvm/include/llvm/ADT/iterator_range.h @@ -43,8 +43,8 @@ class iterator_range { IteratorT begin_iterator, end_iterator; public: -#if __GNUC__ == 7 - // Be careful no to break gcc-7 on the mlir target. +#if __GNUC__ == 7 || (__GNUC__ == 8 && __GNUC_MINOR__ < 4) + // Be careful no to break gcc-7 and gcc-8 < 8.4 on the mlir target. // See https://github.com/llvm/llvm-project/issues/63843 template #else diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h index d6f732d35fd4cd..e8e4f491be5a3d 100644 --- a/llvm/include/llvm/Analysis/AliasAnalysis.h +++ b/llvm/include/llvm/Analysis/AliasAnalysis.h @@ -287,6 +287,10 @@ class AAQueryInfo { /// store %l, ... bool MayBeCrossIteration = false; + /// Whether alias analysis is allowed to use the dominator tree, for use by + /// passes that lazily update the DT while performing AA queries. + bool UseDominatorTree = true; + AAQueryInfo(AAResults &AAR, CaptureInfo *CI) : AAR(AAR), CI(CI) {} }; @@ -668,6 +672,9 @@ class BatchAAResults { void enableCrossIterationMode() { AAQI.MayBeCrossIteration = true; } + + /// Disable the use of the dominator tree during alias analysis queries. 
+ void disableDominatorTree() { AAQI.UseDominatorTree = false; } }; /// Temporary typedef for legacy code that uses a generic \c AliasAnalysis diff --git a/llvm/include/llvm/Analysis/BasicAliasAnalysis.h b/llvm/include/llvm/Analysis/BasicAliasAnalysis.h index afc1811239f283..7eca82729430dd 100644 --- a/llvm/include/llvm/Analysis/BasicAliasAnalysis.h +++ b/llvm/include/llvm/Analysis/BasicAliasAnalysis.h @@ -43,20 +43,26 @@ class BasicAAResult : public AAResultBase { const Function &F; const TargetLibraryInfo &TLI; AssumptionCache &AC; - DominatorTree *DT; + /// Use getDT() instead of accessing this member directly, in order to + /// respect the AAQI.UseDominatorTree option. + DominatorTree *DT_; + + DominatorTree *getDT(const AAQueryInfo &AAQI) const { + return AAQI.UseDominatorTree ? DT_ : nullptr; + } public: BasicAAResult(const DataLayout &DL, const Function &F, const TargetLibraryInfo &TLI, AssumptionCache &AC, DominatorTree *DT = nullptr) - : DL(DL), F(F), TLI(TLI), AC(AC), DT(DT) {} + : DL(DL), F(F), TLI(TLI), AC(AC), DT_(DT) {} BasicAAResult(const BasicAAResult &Arg) : AAResultBase(Arg), DL(Arg.DL), F(Arg.F), TLI(Arg.TLI), AC(Arg.AC), - DT(Arg.DT) {} + DT_(Arg.DT_) {} BasicAAResult(BasicAAResult &&Arg) : AAResultBase(std::move(Arg)), DL(Arg.DL), F(Arg.F), TLI(Arg.TLI), - AC(Arg.AC), DT(Arg.DT) {} + AC(Arg.AC), DT_(Arg.DT_) {} /// Handle invalidation events in the new pass manager. 
bool invalidate(Function &Fn, const PreservedAnalyses &PA, diff --git a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h index 6b9d1781820111..91e1872e9bd6ff 100644 --- a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h +++ b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h @@ -122,16 +122,23 @@ class BranchProbabilityInfo { } BranchProbabilityInfo(BranchProbabilityInfo &&Arg) - : Probs(std::move(Arg.Probs)), LastF(Arg.LastF), - EstimatedBlockWeight(std::move(Arg.EstimatedBlockWeight)) {} + : Handles(std::move(Arg.Handles)), Probs(std::move(Arg.Probs)), + LastF(Arg.LastF), + EstimatedBlockWeight(std::move(Arg.EstimatedBlockWeight)) { + for (auto &Handle : Handles) + Handle.setBPI(this); + } BranchProbabilityInfo(const BranchProbabilityInfo &) = delete; BranchProbabilityInfo &operator=(const BranchProbabilityInfo &) = delete; BranchProbabilityInfo &operator=(BranchProbabilityInfo &&RHS) { releaseMemory(); + Handles = std::move(RHS.Handles); Probs = std::move(RHS.Probs); EstimatedBlockWeight = std::move(RHS.EstimatedBlockWeight); + for (auto &Handle : Handles) + Handle.setBPI(this); return *this; } @@ -279,6 +286,8 @@ class BranchProbabilityInfo { } public: + void setBPI(BranchProbabilityInfo *BPI) { this->BPI = BPI; } + BasicBlockCallbackVH(const Value *V, BranchProbabilityInfo *BPI = nullptr) : CallbackVH(const_cast(V)), BPI(BPI) {} }; diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h index 2880ed33a34cbc..0926093bba99de 100644 --- a/llvm/include/llvm/Analysis/Loads.h +++ b/llvm/include/llvm/Analysis/Loads.h @@ -18,7 +18,7 @@ namespace llvm { -class AAResults; +class BatchAAResults; class AssumptionCache; class DataLayout; class DominatorTree; @@ -129,11 +129,10 @@ extern cl::opt DefMaxInstsToScan; /// location in memory, as opposed to the value operand of a store. /// /// \returns The found value, or nullptr if no value is found. 
-Value *FindAvailableLoadedValue(LoadInst *Load, - BasicBlock *ScanBB, +Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, unsigned MaxInstsToScan = DefMaxInstsToScan, - AAResults *AA = nullptr, + BatchAAResults *AA = nullptr, bool *IsLoadCSE = nullptr, unsigned *NumScanedInst = nullptr); @@ -141,7 +140,8 @@ Value *FindAvailableLoadedValue(LoadInst *Load, /// FindAvailableLoadedValue() for the case where we are not interested in /// finding the closest clobbering instruction if no available load is found. /// This overload cannot be used to scan across multiple blocks. -Value *FindAvailableLoadedValue(LoadInst *Load, AAResults &AA, bool *IsLoadCSE, +Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA, + bool *IsLoadCSE, unsigned MaxInstsToScan = DefMaxInstsToScan); /// Scan backwards to see if we have the value of the given pointer available @@ -170,7 +170,7 @@ Value *FindAvailableLoadedValue(LoadInst *Load, AAResults &AA, bool *IsLoadCSE, Value *findAvailablePtrLoadStore(const MemoryLocation &Loc, Type *AccessTy, bool AtLeastAtomic, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, - unsigned MaxInstsToScan, AAResults *AA, + unsigned MaxInstsToScan, BatchAAResults *AA, bool *IsLoadCSE, unsigned *NumScanedInst); /// Returns true if a pointer value \p A can be replace with another pointer diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index af3ad822e0b0de..0880f9c65aa45d 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1314,6 +1314,13 @@ class ScalarEvolution { void getPoisonGeneratingValues(SmallPtrSetImpl &Result, const SCEV *S); + /// Check whether it is poison-safe to represent the expression S using the + /// instruction I. If such a replacement is performed, the poison flags of + /// instructions in DropPoisonGeneratingInsts must be dropped. 
+ bool canReuseInstruction( + const SCEV *S, Instruction *I, + SmallVectorImpl &DropPoisonGeneratingInsts); + class FoldID { const SCEV *Op = nullptr; const Type *Ty = nullptr; diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def index f09e12f3038cac..07edf68c667a27 100644 --- a/llvm/include/llvm/Analysis/VecFuncs.def +++ b/llvm/include/llvm/Analysis/VecFuncs.def @@ -771,8 +771,8 @@ TLI_DEFINE_VECFUNC("log2f", "_ZGVsMxv_log2f", SCALABLE(4), MASKED, "_ZGVsMxv") TLI_DEFINE_VECFUNC("llvm.log2.f64", "_ZGVsMxv_log2", SCALABLE(2), MASKED, "_ZGVsMxv") TLI_DEFINE_VECFUNC("llvm.log2.f32", "_ZGVsMxv_log2f", SCALABLE(4), MASKED, "_ZGVsMxv") -TLI_DEFINE_VECFUNC("modf", "_ZGVsMxvl8_modf", SCALABLE(2), MASKED, "_ZGVsMxvl8") -TLI_DEFINE_VECFUNC("modff", "_ZGVsMxvl4_modff", SCALABLE(4), MASKED, "_ZGVsMxvl4") +TLI_DEFINE_VECFUNC("modf", "_ZGVsNxvl8_modf", SCALABLE(2), NOMASK, "_ZGVsNxvl8") +TLI_DEFINE_VECFUNC("modff", "_ZGVsNxvl4_modff", SCALABLE(4), NOMASK, "_ZGVsNxvl4") TLI_DEFINE_VECFUNC("nextafter", "_ZGVsMxvv_nextafter", SCALABLE(2), MASKED, "_ZGVsMxvv") TLI_DEFINE_VECFUNC("nextafterf", "_ZGVsMxvv_nextafterf", SCALABLE(4), MASKED, "_ZGVsMxvv") @@ -787,11 +787,11 @@ TLI_DEFINE_VECFUNC("sinf", "_ZGVsMxv_sinf", SCALABLE(4), MASKED, "_ZGVsMxv") TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVsMxv_sin", SCALABLE(2), MASKED, "_ZGVsMxv") TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVsMxv_sinf", SCALABLE(4), MASKED, "_ZGVsMxv") -TLI_DEFINE_VECFUNC("sincos", "_ZGVsMxvl8l8_sincos", SCALABLE(2), MASKED, "_ZGVsMxvl8l8") -TLI_DEFINE_VECFUNC("sincosf", "_ZGVsMxvl4l4_sincosf", SCALABLE(4), MASKED, "_ZGVsMxvl4l4") +TLI_DEFINE_VECFUNC("sincos", "_ZGVsNxvl8l8_sincos", SCALABLE(2), NOMASK, "_ZGVsNxvl8l8") +TLI_DEFINE_VECFUNC("sincosf", "_ZGVsNxvl4l4_sincosf", SCALABLE(4), NOMASK, "_ZGVsNxvl4l4") -TLI_DEFINE_VECFUNC("sincospi", "_ZGVsMxvl8l8_sincospi", SCALABLE(2), MASKED, "_ZGVsMxvl8l8") -TLI_DEFINE_VECFUNC("sincospif", "_ZGVsMxvl4l4_sincospif", SCALABLE(4), MASKED, 
"_ZGVsMxvl4l4") +TLI_DEFINE_VECFUNC("sincospi", "_ZGVsNxvl8l8_sincospi", SCALABLE(2), NOMASK, "_ZGVsNxvl8l8") +TLI_DEFINE_VECFUNC("sincospif", "_ZGVsNxvl4l4_sincospif", SCALABLE(4), NOMASK, "_ZGVsNxvl4l4") TLI_DEFINE_VECFUNC("sinh", "_ZGVsMxv_sinh", SCALABLE(2), MASKED, "_ZGVsMxv") TLI_DEFINE_VECFUNC("sinhf", "_ZGVsMxv_sinhf", SCALABLE(4), MASKED, "_ZGVsMxv") @@ -1005,8 +1005,6 @@ TLI_DEFINE_VECFUNC("llvm.log2.f32", "armpl_svlog2_f32_x", SCALABLE(4), MASKED, " TLI_DEFINE_VECFUNC("modf", "armpl_vmodfq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vl8") TLI_DEFINE_VECFUNC("modff", "armpl_vmodfq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl4") -TLI_DEFINE_VECFUNC("modf", "armpl_svmodf_f64_x", SCALABLE(2), MASKED, "_ZGVsMxvl8") -TLI_DEFINE_VECFUNC("modff", "armpl_svmodf_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvl4") TLI_DEFINE_VECFUNC("nextafter", "armpl_vnextafterq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv") TLI_DEFINE_VECFUNC("nextafterf", "armpl_vnextafterq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv") @@ -1035,13 +1033,9 @@ TLI_DEFINE_VECFUNC("llvm.sin.f32", "armpl_svsin_f32_x", SCALABLE(4), MASKED, "_Z TLI_DEFINE_VECFUNC("sincos", "armpl_vsincosq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vl8l8") TLI_DEFINE_VECFUNC("sincosf", "armpl_vsincosq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl4l4") -TLI_DEFINE_VECFUNC("sincos", "armpl_svsincos_f64_x", SCALABLE(2), MASKED, "_ZGVsMxvl8l8") -TLI_DEFINE_VECFUNC("sincosf", "armpl_svsincos_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvl4l4") TLI_DEFINE_VECFUNC("sincospi", "armpl_vsincospiq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vl8l8") TLI_DEFINE_VECFUNC("sincospif", "armpl_vsincospiq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl4l4") -TLI_DEFINE_VECFUNC("sincospi", "armpl_svsincospi_f64_x", SCALABLE(2), MASKED, "_ZGVsMxvl8l8") -TLI_DEFINE_VECFUNC("sincospif", "armpl_svsincospi_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvl4l4") TLI_DEFINE_VECFUNC("sinh", "armpl_vsinhq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v") TLI_DEFINE_VECFUNC("sinhf", "armpl_vsinhq_f32", FIXED(4), NOMASK, 
"_ZGV_LLVM_N4v") diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index 7a92e62b53c53d..c6eb66cc9660ca 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -406,6 +406,11 @@ bool maskIsAllZeroOrUndef(Value *Mask); /// lanes can be assumed active. bool maskIsAllOneOrUndef(Value *Mask); +/// Given a mask vector of i1, Return true if any of the elements of this +/// predicate mask are known to be true or undef. That is, return true if at +/// least one lane can be assumed active. +bool maskContainsAllOneOrUndef(Value *Mask); + /// Given a mask vector of the form , return an APInt (of bitwidth Y) /// for each lane which may be active. APInt possiblyDemandedEltsInMask(Value *Mask); diff --git a/llvm/include/llvm/BinaryFormat/COFF.h b/llvm/include/llvm/BinaryFormat/COFF.h index 522ee37da6e830..72461d0d9c316a 100644 --- a/llvm/include/llvm/BinaryFormat/COFF.h +++ b/llvm/include/llvm/BinaryFormat/COFF.h @@ -716,7 +716,10 @@ enum ImportNameType : unsigned { IMPORT_NAME_NOPREFIX = 2, /// The import name is the public symbol name, but skipping the leading ?, /// @, or optionally _, and truncating at the first @. - IMPORT_NAME_UNDECORATE = 3 + IMPORT_NAME_UNDECORATE = 3, + /// The import name is specified as a separate string in the import library + /// object file. + IMPORT_NAME_EXPORTAS = 4 }; enum class GuardFlags : uint32_t { diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 81cdd39afc6bab..f17ba75e3efa6a 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -1464,6 +1464,7 @@ enum { PT_OPENBSD_RANDOMIZE = 0x65a3dbe6, // Fill with random data. PT_OPENBSD_WXNEEDED = 0x65a3dbe7, // Program does W^X violations. PT_OPENBSD_NOBTCFI = 0x65a3dbe8, // Do not enforce branch target CFI. + PT_OPENBSD_SYSCALLS = 0x65a3dbe9, // System call sites. 
PT_OPENBSD_BOOTDATA = 0x65a41be6, // Section for boot arguments. // ARM program header types. diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h b/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h index 0f20a33f3a755c..7990997835d019 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h @@ -35,11 +35,23 @@ struct LegalityQuery; class MachineRegisterInfo; namespace GISelAddressing { /// Helper struct to store a base, index and offset that forms an address -struct BaseIndexOffset { +class BaseIndexOffset { +private: Register BaseReg; Register IndexReg; - int64_t Offset = 0; - bool IsIndexSignExt = false; + std::optional Offset; + +public: + BaseIndexOffset() = default; + Register getBase() { return BaseReg; } + Register getBase() const { return BaseReg; } + Register getIndex() { return IndexReg; } + Register getIndex() const { return IndexReg; } + void setBase(Register NewBase) { BaseReg = NewBase; } + void setIndex(Register NewIndex) { IndexReg = NewIndex; } + void setOffset(std::optional NewOff) { Offset = NewOff; } + bool hasValidOffset() const { return Offset.has_value(); } + int64_t getOffset() const { return *Offset; } }; /// Returns a BaseIndexOffset which describes the pointer in \p Ptr. @@ -89,7 +101,7 @@ class LoadStoreOpt : public MachineFunctionPass { // order stores are writing to incremeneting consecutive addresses. So when // we walk the block in reverse order, the next eligible store must write to // an offset one store width lower than CurrentLowestOffset. - uint64_t CurrentLowestOffset; + int64_t CurrentLowestOffset; SmallVector Stores; // A vector of MachineInstr/unsigned pairs to denote potential aliases that // need to be checked before the candidate is considered safe to merge. 
The diff --git a/llvm/include/llvm/CodeGen/LivePhysRegs.h b/llvm/include/llvm/CodeGen/LivePhysRegs.h index 76bb34d270a26d..1d40b1cbb0eaa3 100644 --- a/llvm/include/llvm/CodeGen/LivePhysRegs.h +++ b/llvm/include/llvm/CodeGen/LivePhysRegs.h @@ -193,11 +193,18 @@ void addLiveIns(MachineBasicBlock &MBB, const LivePhysRegs &LiveRegs); void computeAndAddLiveIns(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB); -/// Convenience function for recomputing live-in's for \p MBB. -static inline void recomputeLiveIns(MachineBasicBlock &MBB) { +/// Convenience function for recomputing live-in's for a MBB. Returns true if +/// any changes were made. +static inline bool recomputeLiveIns(MachineBasicBlock &MBB) { LivePhysRegs LPR; + auto oldLiveIns = MBB.getLiveIns(); + MBB.clearLiveIns(); computeAndAddLiveIns(LPR, MBB); + MBB.sortUniqueLiveIns(); + + auto newLiveIns = MBB.getLiveIns(); + return oldLiveIns != newLiveIns; } } // end namespace llvm diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index c84fd281c6a549..dc2035fa598c46 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -111,6 +111,10 @@ class MachineBasicBlock RegisterMaskPair(MCPhysReg PhysReg, LaneBitmask LaneMask) : PhysReg(PhysReg), LaneMask(LaneMask) {} + + bool operator==(const RegisterMaskPair &other) const { + return PhysReg == other.PhysReg && LaneMask == other.LaneMask; + } }; private: @@ -473,6 +477,8 @@ class MachineBasicBlock /// Remove entry from the livein set and return iterator to the next. 
livein_iterator removeLiveIn(livein_iterator I); + std::vector getLiveIns() const { return LiveIns; } + class liveout_iterator { public: using iterator_category = std::input_iterator_tag; diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 864f87f3383891..d22eb76d2292d5 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -339,14 +339,26 @@ def UseSampleProfile : StrBoolAttr<"use-sample-profile">; def DenormalFPMath : ComplexStrAttr<"denormal-fp-math", [FnAttr]>; def DenormalFPMathF32 : ComplexStrAttr<"denormal-fp-math-f32", [FnAttr]>; +// Attribute compatibility rules are generated to check the attribute of the +// caller and callee and decide whether inlining should be allowed. CompatRule +// and child classes are used for the rule generation. CompatRule takes only a +// compare function which could be templated with the attribute type. +// CompatRuleStrAttr takes the compare function and the string attribute for +// checking compatibility for inline substitution. class CompatRule { - // The name of the function called to check the attribute of the caller and - // callee and decide whether inlining should be allowed. The function's - // signature must match "bool(const Function&, const Function &)", where the - // first parameter is the reference to the caller and the second parameter is - // the reference to the callee. It must return false if the attributes of the - // caller and callee are incompatible, and true otherwise. + // The function's signature must match "bool(const Function&, const + // Function&)", where the first parameter is the reference to the caller and + // the second parameter is the reference to the callee. It must return false + // if the attributes of the caller and callee are incompatible, and true + // otherwise. 
string CompatFunc = F; + string AttrName = ""; +} + +class CompatRuleStrAttr : CompatRule { + // The checker function is extended with an third argument as the function + // attribute string "bool(const Function&, const Function&, const StringRef&)". + string AttrName = Attr; } def : CompatRule<"isEqual">; @@ -359,7 +371,9 @@ def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"checkDenormMode">; - +def : CompatRuleStrAttr<"isEqual", "sign-return-address">; +def : CompatRuleStrAttr<"isEqual", "sign-return-address-key">; +def : CompatRuleStrAttr<"isEqual", "branch-protection-pauth-lr">; class MergeRule { // The name of the function called to merge the attributes of the caller and diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index e6db9da5526aa3..c5f43d17d1c148 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2601,6 +2601,11 @@ def int_amdgcn_ds_bvh_stack_rtn : [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] >; +def int_amdgcn_s_wait_event_export_ready : + ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, + Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] +>; + // WMMA (Wave Matrix Multiply-Accumulate) intrinsics // // These operations perform a matrix multiplication and accumulation of @@ -2608,10 +2613,10 @@ def int_amdgcn_ds_bvh_stack_rtn : class AMDGPUWmmaIntrinsic : Intrinsic< - [CD], // %D + [CD], // %D [ AB, // %A - AB, // %B + LLVMMatchType<1>, // %B LLVMMatchType<0>, // %C ], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] @@ -2619,49 +2624,50 @@ class AMDGPUWmmaIntrinsic : class AMDGPUWmmaIntrinsicOPSEL : Intrinsic< - [CD], // %D + [CD], // %D [ AB, // %A - AB, // %B + LLVMMatchType<1>, // %B LLVMMatchType<0>, // %C - llvm_i1_ty, // %high + llvm_i1_ty, // %high (op_sel) for GFX11, 0 for GFX12 ], [IntrNoMem, IntrConvergent, ImmArg>, 
IntrWillReturn, IntrNoCallback, IntrNoFree] >; class AMDGPUWmmaIntrinsicIU : Intrinsic< - [CD], // %D + [CD], // %D [ llvm_i1_ty, // %A_sign AB, // %A llvm_i1_ty, // %B_sign - AB, // %B + LLVMMatchType<1>, // %B LLVMMatchType<0>, // %C llvm_i1_ty, // %clamp ], [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] >; -def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic; -def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic; -// The regular, untied f16/bf16 wmma intrinsics only write to one half -// of the registers (set via the op_sel bit). -// The content of the other 16-bit of the registers is undefined. -def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL; -def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL; -// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix -// registers to the input accumulator registers. -// Essentially, the content of the other 16-bit is preserved from the input. -def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL; -def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL; -def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU; -def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU; +// WMMA GFX11Only -def int_amdgcn_s_wait_event_export_ready : - ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, - Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] ->; +// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. +// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers. +// The content of the other 16-bit half is preserved from the input. 
+def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL; +def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL; + +// WMMA GFX11Plus + +def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU; +def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU; + +// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. +// The content of the other 16-bit half is undefined. +// GFX12: The op_sel bit must be 0. +def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL; +def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL; //===----------------------------------------------------------------------===// // GFX12 Intrinsics @@ -2681,6 +2687,65 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var" [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree]>; + +// WMMA (Wave Matrix Multiply-Accumulate) intrinsics +// +// These operations perform a matrix multiplication and accumulation of +// the form: D = A * B + C . + +// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>. +def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic; +// A and B are <16 x iu4>. +def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU; + +// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics +// +// These operations perform a sparse matrix multiplication and accumulation of +// the form: D = A * B + C. +// A is sparse matrix, half the size of B, and is expanded using sparsity index. 
+ +class AMDGPUSWmmacIntrinsicIdx : + Intrinsic< + [CD], // %D + [ + A, // %A + B, // %B + LLVMMatchType<0>, // %C + Index // %Sparsity index for A + ], + [IntrNoMem, IntrConvergent, IntrWillReturn] +>; + +class AMDGPUSWmmacIntrinsicIUIdx : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_sign + A, // %A + llvm_i1_ty, // %B_sign + B, // %B + LLVMMatchType<0>, // %C + Index, // %Sparsity index for A + llvm_i1_ty, // %clamp + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, ImmArg>] +>; + +def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_bf16_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_i32_16x16x32_iu8 : AMDGPUSWmmacIntrinsicIUIdx; +def int_amdgcn_swmmac_i32_16x16x32_iu4 : AMDGPUSWmmacIntrinsicIUIdx; +def int_amdgcn_swmmac_i32_16x16x64_iu4 : AMDGPUSWmmacIntrinsicIUIdx; +def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx; +def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx; + def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn; def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn; @@ -2712,6 +2777,10 @@ class AMDGPULoadTr: def int_amdgcn_global_load_tr : AMDGPULoadTr; +// i32 @llvm.amdgcn.wave.id() +def int_amdgcn_wave_id : + DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; + //===----------------------------------------------------------------------===// // Deep learning intrinsics. 
//===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Object/COFF.h b/llvm/include/llvm/Object/COFF.h index a548b2c15c5fdc..2a5c3d8913b15c 100644 --- a/llvm/include/llvm/Object/COFF.h +++ b/llvm/include/llvm/Object/COFF.h @@ -1362,6 +1362,47 @@ class SectionStrippedError SectionStrippedError() { setErrorCode(object_error::section_stripped); } }; +inline std::optional +getArm64ECMangledFunctionName(StringRef Name) { + bool IsCppFn = Name[0] == '?'; + if (IsCppFn && Name.find("$$h") != std::string::npos) + return std::nullopt; + if (!IsCppFn && Name[0] == '#') + return std::nullopt; + + StringRef Prefix = "$$h"; + size_t InsertIdx = 0; + if (IsCppFn) { + InsertIdx = Name.find("@@"); + size_t ThreeAtSignsIdx = Name.find("@@@"); + if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) { + InsertIdx += 2; + } else { + InsertIdx = Name.find("@"); + if (InsertIdx != std::string::npos) + InsertIdx++; + } + } else { + Prefix = "#"; + } + + return std::optional( + (Name.substr(0, InsertIdx) + Prefix + Name.substr(InsertIdx)).str()); +} + +inline std::optional +getArm64ECDemangledFunctionName(StringRef Name) { + if (Name[0] == '#') + return std::string(Name.substr(1)); + if (Name[0] != '?') + return std::nullopt; + + std::pair Pair = Name.split("$$h"); + if (Pair.second.empty()) + return std::nullopt; + return (Pair.first + Pair.second).str(); +} + } // end namespace object } // end namespace llvm diff --git a/llvm/include/llvm/Object/COFFImportFile.h b/llvm/include/llvm/Object/COFFImportFile.h index edc836ff0348cb..8358197309f000 100644 --- a/llvm/include/llvm/Object/COFFImportFile.h +++ b/llvm/include/llvm/Object/COFFImportFile.h @@ -26,7 +26,16 @@ namespace llvm { namespace object { +constexpr std::string_view ImportDescriptorPrefix = "__IMPORT_DESCRIPTOR_"; +constexpr std::string_view NullImportDescriptorSymbolName = + "__NULL_IMPORT_DESCRIPTOR"; +constexpr std::string_view NullThunkDataPrefix = 
"\x7f"; +constexpr std::string_view NullThunkDataSuffix = "_NULL_THUNK_DATA"; + class COFFImportFile : public SymbolicFile { +private: + enum SymbolIndex { ImpSymbol, ThunkSymbol, ECAuxSymbol, ECThunkSymbol }; + public: COFFImportFile(MemoryBufferRef Source) : SymbolicFile(ID_COFFImportFile, Source) {} @@ -36,9 +45,23 @@ class COFFImportFile : public SymbolicFile { void moveSymbolNext(DataRefImpl &Symb) const override { ++Symb.p; } Error printSymbolName(raw_ostream &OS, DataRefImpl Symb) const override { - if (Symb.p == 0) + switch (Symb.p) { + case ImpSymbol: OS << "__imp_"; - OS << StringRef(Data.getBufferStart() + sizeof(coff_import_header)); + break; + case ECAuxSymbol: + OS << "__imp_aux_"; + break; + } + const char *Name = Data.getBufferStart() + sizeof(coff_import_header); + if (Symb.p != ECThunkSymbol && COFF::isArm64EC(getMachine())) { + if (std::optional DemangledName = + getArm64ECDemangledFunctionName(Name)) { + OS << StringRef(*DemangledName); + return Error::success(); + } + } + OS << StringRef(Name); return Error::success(); } @@ -52,7 +75,12 @@ class COFFImportFile : public SymbolicFile { basic_symbol_iterator symbol_end() const override { DataRefImpl Symb; - Symb.p = isData() ? 1 : 2; + if (isData()) + Symb.p = ImpSymbol + 1; + else if (COFF::isArm64EC(getMachine())) + Symb.p = ECThunkSymbol + 1; + else + Symb.p = ThunkSymbol + 1; return BasicSymbolRef(Symb, this); } @@ -66,6 +94,7 @@ class COFFImportFile : public SymbolicFile { uint16_t getMachine() const { return getCOFFImportHeader()->Machine; } StringRef getFileFormatName() const; + StringRef getExportName() const; private: bool isData() const { diff --git a/llvm/include/llvm/Support/FormattedStream.h b/llvm/include/llvm/Support/FormattedStream.h index 5f937cfa798408..850a18dbb94121 100644 --- a/llvm/include/llvm/Support/FormattedStream.h +++ b/llvm/include/llvm/Support/FormattedStream.h @@ -52,6 +52,10 @@ class formatted_raw_ostream : public raw_ostream { /// have the rest of it. 
SmallString<4> PartialUTF8Char; + /// DisableScan - Temporarily disable scanning of output. Used to ignore color + /// codes. + bool DisableScan; + void write_impl(const char *Ptr, size_t Size) override; /// current_pos - Return the current position within the stream, @@ -89,9 +93,33 @@ class formatted_raw_ostream : public raw_ostream { SetUnbuffered(); TheStream->SetUnbuffered(); + enable_colors(TheStream->colors_enabled()); + Scanned = nullptr; } + void PreDisableScan() { + assert(!DisableScan); + ComputePosition(getBufferStart(), GetNumBytesInBuffer()); + assert(PartialUTF8Char.empty()); + DisableScan = true; + } + + void PostDisableScan() { + assert(DisableScan); + DisableScan = false; + Scanned = getBufferStart() + GetNumBytesInBuffer(); + } + + struct DisableScanScope { + formatted_raw_ostream *S; + + DisableScanScope(formatted_raw_ostream *FRO) : S(FRO) { + S->PreDisableScan(); + } + ~DisableScanScope() { S->PostDisableScan(); } + }; + public: /// formatted_raw_ostream - Open the specified file for /// writing. If an error occurs, information about the error is @@ -104,12 +132,12 @@ class formatted_raw_ostream : public raw_ostream { /// underneath it. 
/// formatted_raw_ostream(raw_ostream &Stream) - : TheStream(nullptr), Position(0, 0) { + : TheStream(nullptr), Position(0, 0), DisableScan(false) { setStream(Stream); } - explicit formatted_raw_ostream() : TheStream(nullptr), Position(0, 0) { - Scanned = nullptr; - } + explicit formatted_raw_ostream() + : TheStream(nullptr), Position(0, 0), Scanned(nullptr), + DisableScan(false) {} ~formatted_raw_ostream() override { flush(); @@ -136,17 +164,26 @@ class formatted_raw_ostream : public raw_ostream { } raw_ostream &resetColor() override { - TheStream->resetColor(); + if (colors_enabled()) { + DisableScanScope S(this); + raw_ostream::resetColor(); + } return *this; } raw_ostream &reverseColor() override { - TheStream->reverseColor(); + if (colors_enabled()) { + DisableScanScope S(this); + raw_ostream::reverseColor(); + } return *this; } raw_ostream &changeColor(enum Colors Color, bool Bold, bool BG) override { - TheStream->changeColor(Color, Bold, BG); + if (colors_enabled()) { + DisableScanScope S(this); + raw_ostream::changeColor(Color, Bold, BG); + } return *this; } diff --git a/llvm/include/llvm/Support/X86FoldTablesUtils.h b/llvm/include/llvm/Support/X86FoldTablesUtils.h index ed244febc38d3a..77d32cc7fb37ed 100644 --- a/llvm/include/llvm/Support/X86FoldTablesUtils.h +++ b/llvm/include/llvm/Support/X86FoldTablesUtils.h @@ -46,11 +46,12 @@ enum { // Broadcast type. 
// (stored in bits 12 - 14) TB_BCAST_TYPE_SHIFT = TB_ALIGN_SHIFT + 3, - TB_BCAST_D = 0 << TB_BCAST_TYPE_SHIFT, - TB_BCAST_Q = 1 << TB_BCAST_TYPE_SHIFT, - TB_BCAST_SS = 2 << TB_BCAST_TYPE_SHIFT, - TB_BCAST_SD = 3 << TB_BCAST_TYPE_SHIFT, - TB_BCAST_SH = 4 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_W = 0 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_D = 1 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_Q = 2 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_SS = 3 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_SD = 4 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_SH = 5 << TB_BCAST_TYPE_SHIFT, TB_BCAST_MASK = 0x7 << TB_BCAST_TYPE_SHIFT, // Unused bits 15-16 diff --git a/llvm/include/llvm/Target/TargetInstrPredicate.td b/llvm/include/llvm/Target/TargetInstrPredicate.td index 82c4c7b23a49b6..b5419cb9f3867f 100644 --- a/llvm/include/llvm/Target/TargetInstrPredicate.td +++ b/llvm/include/llvm/Target/TargetInstrPredicate.td @@ -152,6 +152,34 @@ class CheckImmOperand_s : CheckOperandBase { string ImmVal = Value; } +// Check that the operand at position `Index` is less than `Imm`. +// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against `Imm`. +class CheckImmOperandLT : CheckOperandBase { + int ImmVal = Imm; +} + +// Check that the operand at position `Index` is greater than `Imm`. +// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against `Imm`. +class CheckImmOperandGT : CheckOperandBase { + int ImmVal = Imm; +} + +// Check that the operand at position `Index` is less than or equal to `Imm`. +// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against `Imm`. +class CheckImmOperandLE : CheckNot>; + +// Check that the operand at position `Index` is greater than or equal to `Imm`. 
+// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against `Imm`. +class CheckImmOperandGE : CheckNot>; + // Expands to a call to `FunctionMapper` if field `FunctionMapper` is set. // Otherwise, it expands to a CheckNot>. class CheckRegOperandSimple : CheckOperandBase; @@ -203,6 +231,12 @@ class CheckAll Sequence> class CheckAny Sequence> : CheckPredicateSequence; +// Check that the operand at position `Index` is in range [Start, End]. +// If field `FunctionMapper` is a non-empty string, then function +// `FunctionMapper` is applied to the operand value, and the return value is then +// compared against range [Start, End]. +class CheckImmOperandRange + : CheckAll<[CheckImmOperandGE, CheckImmOperandLE]>; // Used to expand the body of a function predicate. See the definition of // TIIPredicate below. diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td index 032de728517827..40c2cce8c6effe 100644 --- a/llvm/include/llvm/Target/TargetSchedule.td +++ b/llvm/include/llvm/Target/TargetSchedule.td @@ -620,7 +620,7 @@ class SecondFusionPredicateWithMCInstPredicate : FusionPredicateWithMCInstPredicate; // The pred will be applied on both firstMI and secondMI. class BothFusionPredicateWithMCInstPredicate - : FusionPredicateWithMCInstPredicate; + : FusionPredicateWithMCInstPredicate; // Tie firstOpIdx and secondOpIdx. 
The operand of `FirstMI` at position // `firstOpIdx` should be the same as the operand of `SecondMI` at position diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 623fdc21ba65a6..c10f92e2871747 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -478,7 +478,7 @@ inline constexpr ArchInfo ARMV8_1A = { VersionTuple{8, 1}, AProfile, "armv8.1-a inline constexpr ArchInfo ARMV8_2A = { VersionTuple{8, 2}, AProfile, "armv8.2-a", "+v8.2a", (ARMV8_1A.DefaultExts | AArch64::ExtensionBitset({AArch64::AEK_RAS}))}; inline constexpr ArchInfo ARMV8_3A = { VersionTuple{8, 3}, AProfile, "armv8.3-a", "+v8.3a", (ARMV8_2A.DefaultExts | - AArch64::ExtensionBitset({AArch64::AEK_RCPC, AArch64::AEK_JSCVT, AArch64::AEK_FCMA}))}; + AArch64::ExtensionBitset({AArch64::AEK_FCMA, AArch64::AEK_JSCVT, AArch64::AEK_PAUTH, AArch64::AEK_RCPC}))}; inline constexpr ArchInfo ARMV8_4A = { VersionTuple{8, 4}, AProfile, "armv8.4-a", "+v8.4a", (ARMV8_3A.DefaultExts | AArch64::ExtensionBitset({AArch64::AEK_DOTPROD}))}; inline constexpr ArchInfo ARMV8_5A = { VersionTuple{8, 5}, AProfile, "armv8.5-a", "+v8.5a", (ARMV8_4A.DefaultExts)}; @@ -805,6 +805,12 @@ inline constexpr CpuInfo CpuInfos[] = { {AArch64::AEK_FP16, AArch64::AEK_RAND, AArch64::AEK_SM4, AArch64::AEK_SHA3, AArch64::AEK_SHA2, AArch64::AEK_AES, AArch64::AEK_MTE, AArch64::AEK_SB, AArch64::AEK_SSBS}))}, + {"ampere1b", ARMV8_7A, + (AArch64::ExtensionBitset({AArch64::AEK_FP16, AArch64::AEK_RAND, + AArch64::AEK_SM4, AArch64::AEK_SHA3, + AArch64::AEK_SHA2, AArch64::AEK_AES, + AArch64::AEK_MTE, AArch64::AEK_SB, + AArch64::AEK_SSBS, AArch64::AEK_CSSC}))}, }; // An alias for a CPU. 
@@ -813,7 +819,8 @@ struct CpuAlias { StringRef Name; }; -inline constexpr CpuAlias CpuAliases[] = {{"grace", "neoverse-v2"}}; +inline constexpr CpuAlias CpuAliases[] = {{"cobalt-100", "neoverse-n2"}, + {"grace", "neoverse-v2"}}; bool getExtensionFeatures( const AArch64::ExtensionBitset &Extensions, diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 870dc75b1c1f80..49ec8de9c528de 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -1033,11 +1033,11 @@ class Triple { isWindowsCygwinEnvironment() || isOHOSFamily(); } - /// Tests whether the target uses TLS Descriptor by default. + /// True if the target supports both general-dynamic and TLSDESC, and TLSDESC + /// is enabled by default. bool hasDefaultTLSDESC() const { // TODO: Improve check for other platforms, like Android, and RISC-V - // Note: This is currently only used on RISC-V. - return isOSBinFormatELF() && isAArch64(); + return false; } /// Tests whether the target uses -data-sections as default. diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 3178e2d2781674..1028b52a79123f 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -89,7 +89,7 @@ bool BasicAAResult::invalidate(Function &Fn, const PreservedAnalyses &PA, // may be created without handles to some analyses and in that case don't // depend on them. if (Inv.invalidate(Fn, PA) || - (DT && Inv.invalidate(Fn, PA))) + (DT_ && Inv.invalidate(Fn, PA))) return true; // Otherwise this analysis result remains valid. 
@@ -1063,6 +1063,7 @@ AliasResult BasicAAResult::aliasGEP( : AliasResult::MayAlias; } + DominatorTree *DT = getDT(AAQI); DecomposedGEP DecompGEP1 = DecomposeGEPExpression(GEP1, DL, &AC, DT); DecomposedGEP DecompGEP2 = DecomposeGEPExpression(V2, DL, &AC, DT); @@ -1556,6 +1557,7 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size, const Value *HintO1 = getUnderlyingObject(Hint1); const Value *HintO2 = getUnderlyingObject(Hint2); + DominatorTree *DT = getDT(AAQI); auto ValidAssumeForPtrContext = [&](const Value *Ptr) { if (const Instruction *PtrI = dyn_cast(Ptr)) { return isValidAssumeForContext(Assume, PtrI, DT, @@ -1735,7 +1737,7 @@ bool BasicAAResult::isValueEqualInPotentialCycles(const Value *V, if (!Inst || Inst->getParent()->isEntryBlock()) return true; - return isNotInCycle(Inst, DT, /*LI*/ nullptr); + return isNotInCycle(Inst, getDT(AAQI), /*LI*/ nullptr); } /// Computes the symbolic difference between two de-composed GEPs. diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index d0c27cae0dff99..72b6dfa181e86d 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -439,7 +439,8 @@ static Value *threadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS, // Check that the simplified value has the form "X op Y" where "op" is the // same as the original operation. Instruction *Simplified = dyn_cast(FV ? FV : TV); - if (Simplified && Simplified->getOpcode() == unsigned(Opcode)) { + if (Simplified && Simplified->getOpcode() == unsigned(Opcode) && + !Simplified->hasPoisonGeneratingFlags()) { // The value that didn't simplify is "UnsimplifiedLHS op UnsimplifiedRHS". // We already know that "op" is the same as for the simplified value. See // if the operands match too. If so, return the simplified value. 
diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 1ebc593016bc0d..16635097d20afe 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -657,11 +657,12 @@ Value *Lint::findValueImpl(Value *V, bool OffsetOk, BasicBlock::iterator BBI = L->getIterator(); BasicBlock *BB = L->getParent(); SmallPtrSet VisitedBlocks; + BatchAAResults BatchAA(*AA); for (;;) { if (!VisitedBlocks.insert(BB).second) break; if (Value *U = - FindAvailableLoadedValue(L, BB, BBI, DefMaxInstsToScan, AA)) + FindAvailableLoadedValue(L, BB, BBI, DefMaxInstsToScan, &BatchAA)) return findValueImpl(U, OffsetOk, Visited); if (BBI != BB->begin()) break; diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 97d21db86abf28..5916d2ab48ecec 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -364,7 +364,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size, if (Size.getBitWidth() > 64) return false; - const uint64_t LoadSize = Size.getZExtValue(); + const TypeSize LoadSize = TypeSize::getFixed(Size.getZExtValue()); // Otherwise, be a little bit aggressive by scanning the local block where we // want to check to see if the pointer is already being loaded or stored @@ -414,11 +414,11 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size, // Handle trivial cases. 
if (AccessedPtr == V && - LoadSize <= DL.getTypeStoreSize(AccessedTy)) + TypeSize::isKnownLE(LoadSize, DL.getTypeStoreSize(AccessedTy))) return true; if (AreEquivalentAddressValues(AccessedPtr->stripPointerCasts(), V) && - LoadSize <= DL.getTypeStoreSize(AccessedTy)) + TypeSize::isKnownLE(LoadSize, DL.getTypeStoreSize(AccessedTy))) return true; } return false; @@ -450,11 +450,10 @@ llvm::DefMaxInstsToScan("available-load-scan-limit", cl::init(6), cl::Hidden, "to scan backward from a given instruction, when searching for " "available loaded value")); -Value *llvm::FindAvailableLoadedValue(LoadInst *Load, - BasicBlock *ScanBB, +Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, unsigned MaxInstsToScan, - AAResults *AA, bool *IsLoad, + BatchAAResults *AA, bool *IsLoad, unsigned *NumScanedInst) { // Don't CSE load that is volatile or anything stronger than unordered. if (!Load->isUnordered()) @@ -583,7 +582,7 @@ static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr, Value *llvm::findAvailablePtrLoadStore( const MemoryLocation &Loc, Type *AccessTy, bool AtLeastAtomic, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, unsigned MaxInstsToScan, - AAResults *AA, bool *IsLoadCSE, unsigned *NumScanedInst) { + BatchAAResults *AA, bool *IsLoadCSE, unsigned *NumScanedInst) { if (MaxInstsToScan == 0) MaxInstsToScan = ~0U; @@ -664,7 +663,7 @@ Value *llvm::findAvailablePtrLoadStore( return nullptr; } -Value *llvm::FindAvailableLoadedValue(LoadInst *Load, AAResults &AA, +Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA, bool *IsLoadCSE, unsigned MaxInstsToScan) { const DataLayout &DL = Load->getModule()->getDataLayout(); diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 7e67c90152829d..dd6b88fee415a7 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -657,16 +657,18 @@ class 
AccessAnalysis { AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI, MemoryDepChecker::DepCandidates &DA, - PredicatedScalarEvolution &PSE) - : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE) { + PredicatedScalarEvolution &PSE, + SmallPtrSetImpl &LoopAliasScopes) + : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE), + LoopAliasScopes(LoopAliasScopes) { // We're analyzing dependences across loop iterations. BAA.enableCrossIterationMode(); } /// Register a load and whether it is only read from. void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) { - Value *Ptr = const_cast(Loc.Ptr); - AST.add(Loc.getWithNewSize(LocationSize::beforeOrAfterPointer())); + Value *Ptr = const_cast(Loc.Ptr); + AST.add(adjustLoc(Loc)); Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy); if (IsReadOnly) ReadOnlyPtr.insert(Ptr); @@ -674,8 +676,8 @@ class AccessAnalysis { /// Register a store. void addStore(MemoryLocation &Loc, Type *AccessTy) { - Value *Ptr = const_cast(Loc.Ptr); - AST.add(Loc.getWithNewSize(LocationSize::beforeOrAfterPointer())); + Value *Ptr = const_cast(Loc.Ptr); + AST.add(adjustLoc(Loc)); Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy); } @@ -731,6 +733,32 @@ class AccessAnalysis { private: typedef MapVector> PtrAccessMap; + /// Adjust the MemoryLocation so that it represents accesses to this + /// location across all iterations, rather than a single one. + MemoryLocation adjustLoc(MemoryLocation Loc) const { + // The accessed location varies within the loop, but remains within the + // underlying object. + Loc.Size = LocationSize::beforeOrAfterPointer(); + Loc.AATags.Scope = adjustAliasScopeList(Loc.AATags.Scope); + Loc.AATags.NoAlias = adjustAliasScopeList(Loc.AATags.NoAlias); + return Loc; + } + + /// Drop alias scopes that are only valid within a single loop iteration. 
+ MDNode *adjustAliasScopeList(MDNode *ScopeList) const { + if (!ScopeList) + return nullptr; + + // For the sake of simplicity, drop the whole scope list if any scope is + // iteration-local. + if (any_of(ScopeList->operands(), [&](Metadata *Scope) { + return LoopAliasScopes.contains(cast(Scope)); + })) + return nullptr; + + return ScopeList; + } + /// Go over all memory access and check whether runtime pointer checks /// are needed and build sets of dependency check candidates. void processMemAccesses(); @@ -775,6 +803,10 @@ class AccessAnalysis { PredicatedScalarEvolution &PSE; DenseMap> UnderlyingObjects; + + /// Alias scopes that are declared inside the loop, and as such not valid + /// across iterations. + SmallPtrSetImpl &LoopAliasScopes; }; } // end anonymous namespace @@ -2283,6 +2315,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, // Holds the Load and Store instructions. SmallVector Loads; SmallVector Stores; + SmallPtrSet LoopAliasScopes; // Holds all the different accesses in the loop. unsigned NumReads = 0; @@ -2326,6 +2359,11 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, if (HasComplexMemInst) continue; + // Record alias scopes defined inside the loop. + if (auto *Decl = dyn_cast(&I)) + for (Metadata *Op : Decl->getScopeList()->operands()) + LoopAliasScopes.insert(cast(Op)); + // Many math library functions read the rounding mode. We will only // vectorize a loop if it contains known function calls that don't set // the flag. Therefore, it is safe to ignore this read from memory. @@ -2407,7 +2445,8 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, } MemoryDepChecker::DepCandidates DependentAccesses; - AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE); + AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE, + LoopAliasScopes); // Holds the analyzed pointers. We don't want to call getUnderlyingObjects // multiple times on the same object. 
If the ptr is accessed twice, once diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp index e87ae7d71fffe2..aa550f0b6a7bfd 100644 --- a/llvm/lib/Analysis/MemorySSAUpdater.cpp +++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp @@ -692,25 +692,9 @@ void MemorySSAUpdater::updateForClonedLoop(const LoopBlocksRPO &LoopBlocks, continue; // Determine incoming value and add it as incoming from IncBB. - if (MemoryUseOrDef *IncMUD = dyn_cast(IncomingAccess)) { - if (!MSSA->isLiveOnEntryDef(IncMUD)) { - Instruction *IncI = IncMUD->getMemoryInst(); - assert(IncI && "Found MemoryUseOrDef with no Instruction."); - if (Instruction *NewIncI = - cast_or_null(VMap.lookup(IncI))) { - IncMUD = MSSA->getMemoryAccess(NewIncI); - assert(IncMUD && - "MemoryUseOrDef cannot be null, all preds processed."); - } - } - NewPhi->addIncoming(IncMUD, IncBB); - } else { - MemoryPhi *IncPhi = cast(IncomingAccess); - if (MemoryAccess *NewDefPhi = MPhiMap.lookup(IncPhi)) - NewPhi->addIncoming(NewDefPhi, IncBB); - else - NewPhi->addIncoming(IncPhi, IncBB); - } + NewPhi->addIncoming( + getNewDefiningAccessForClone(IncomingAccess, VMap, MPhiMap, MSSA), + IncBB); } if (auto *SingleAccess = onlySingleValue(NewPhi)) { MPhiMap[Phi] = SingleAccess; diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 2acb45837c480a..4b2db80bc1ec30 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -4184,6 +4184,68 @@ void ScalarEvolution::getPoisonGeneratingValues( Result.insert(SU->getValue()); } +bool ScalarEvolution::canReuseInstruction( + const SCEV *S, Instruction *I, + SmallVectorImpl &DropPoisonGeneratingInsts) { + // If the instruction cannot be poison, it's always safe to reuse. + if (programUndefinedIfPoison(I)) + return true; + + // Otherwise, it is possible that I is more poisonous that S. 
Collect the + // poison-contributors of S, and then check whether I has any additional + // poison-contributors. Poison that is contributed through poison-generating + // flags is handled by dropping those flags instead. + SmallPtrSet PoisonVals; + getPoisonGeneratingValues(PoisonVals, S); + + SmallVector Worklist; + SmallPtrSet Visited; + Worklist.push_back(I); + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + if (!Visited.insert(V).second) + continue; + + // Avoid walking large instruction graphs. + if (Visited.size() > 16) + return false; + + // Either the value can't be poison, or the S would also be poison if it + // is. + if (PoisonVals.contains(V) || isGuaranteedNotToBePoison(V)) + continue; + + auto *I = dyn_cast(V); + if (!I) + return false; + + // Disjoint or instructions are interpreted as adds by SCEV. However, we + // can't replace an arbitrary add with disjoint or, even if we drop the + // flag. We would need to convert the or into an add. + if (auto *PDI = dyn_cast(I)) + if (PDI->isDisjoint()) + return false; + + // FIXME: Ignore vscale, even though it technically could be poison. Do this + // because SCEV currently assumes it can't be poison. Remove this special + // case once we proper model when vscale can be poison. + if (auto *II = dyn_cast(I); + II && II->getIntrinsicID() == Intrinsic::vscale) + continue; + + if (canCreatePoison(cast(I), /*ConsiderFlagsAndMetadata*/ false)) + return false; + + // If the instruction can't create poison, we can recurse to its operands. 
+ if (I->hasPoisonGeneratingFlagsOrMetadata()) + DropPoisonGeneratingInsts.push_back(I); + + for (Value *Op : I->operands()) + Worklist.push_back(Op); + } + return true; +} + const SCEV * ScalarEvolution::getSequentialMinMaxExpr(SCEVTypes Kind, SmallVectorImpl &Ops) { diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 5d6c3465a0c364..9f9451e4e814ac 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -5083,8 +5083,13 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts, Op->getOperand(0)->getType()->getScalarType()->getFltSemantics(); // All subnormal inputs should be in the normal range in the result type. - if (APFloat::isRepresentableAsNormalIn(SrcTy, DstTy)) + if (APFloat::isRepresentableAsNormalIn(SrcTy, DstTy)) { + if (Known.KnownFPClasses & fcPosSubnormal) + Known.KnownFPClasses |= fcPosNormal; + if (Known.KnownFPClasses & fcNegSubnormal) + Known.KnownFPClasses |= fcNegNormal; Known.knownNot(fcSubnormal); + } // Sign bit of a nan isn't guaranteed. 
if (!Known.isKnownNeverNaN()) @@ -5981,6 +5986,8 @@ void llvm::getUnderlyingObjects(const Value *V, if (!LI || !LI->isLoopHeader(PN->getParent()) || isSameUnderlyingObjectInLoop(PN, LI)) append_range(Worklist, PN->incoming_values()); + else + Objects.push_back(P); continue; } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 73facc76a92b2c..bf7bc0ba84a033 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1012,6 +1012,31 @@ bool llvm::maskIsAllOneOrUndef(Value *Mask) { return true; } +bool llvm::maskContainsAllOneOrUndef(Value *Mask) { + assert(isa(Mask->getType()) && + isa(Mask->getType()->getScalarType()) && + cast(Mask->getType()->getScalarType())->getBitWidth() == + 1 && + "Mask must be a vector of i1"); + + auto *ConstMask = dyn_cast(Mask); + if (!ConstMask) + return false; + if (ConstMask->isAllOnesValue() || isa(ConstMask)) + return true; + if (isa(ConstMask->getType())) + return false; + for (unsigned + I = 0, + E = cast(ConstMask->getType())->getNumElements(); + I != E; ++I) { + if (auto *MaskElt = ConstMask->getAggregateElement(I)) + if (MaskElt->isAllOnesValue() || isa(MaskElt)) + return true; + } + return false; +} + /// TODO: This is a lot like known bits, but for /// vectors. Is there something we can common this with? 
APInt llvm::possiblyDemandedEltsInMask(Value *Mask) { diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index a9f78358e57b92..ecf7bc30913f51 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -2048,8 +2048,10 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { FBB->erase(FBB->begin(), FIB); if (UpdateLiveIns) { - recomputeLiveIns(*TBB); - recomputeLiveIns(*FBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*TBB) || recomputeLiveIns(*FBB); + } while (anyChange); } ++NumHoist; diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 8ee1f19e083e4e..1cca56fc19cfd8 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -8154,6 +8154,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI, IRBuilder<> Builder(Branch); if (UI->getParent() != Branch->getParent()) UI->moveBefore(Branch); + UI->dropPoisonGeneratingFlags(); Value *NewCmp = Builder.CreateCmp(ICmpInst::ICMP_EQ, UI, ConstantInt::get(UI->getType(), 0)); LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n"); @@ -8167,6 +8168,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI, IRBuilder<> Builder(Branch); if (UI->getParent() != Branch->getParent()) UI->moveBefore(Branch); + UI->dropPoisonGeneratingFlags(); Value *NewCmp = Builder.CreateCmp(Cmp->getPredicate(), UI, ConstantInt::get(UI->getType(), 0)); LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n"); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 772229215e798d..61ddc858ba44c7 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -591,8 +591,8 @@ bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI, UseMI.getOpcode() == TargetOpcode::G_ZEXT || (UseMI.getOpcode() == TargetOpcode::G_ANYEXT)) { 
const auto &MMO = LoadMI->getMMO(); - // For atomics, only form anyextending loads. - if (MMO.isAtomic() && UseMI.getOpcode() != TargetOpcode::G_ANYEXT) + // Don't do anything for atomics. + if (MMO.isAtomic()) continue; // Check for legality. if (!isPreLegalize()) { diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 3b2cf319109273..47d045ac48171e 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -596,6 +596,8 @@ llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall, LostDebugLocObserver &LocObserver, MachineInstr *MI) { auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); const char *Name = TLI.getLibcallName(Libcall); + if (!Name) + return LegalizerHelper::UnableToLegalize; const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall); return createLibcall(MIRBuilder, Name, Result, Args, CC, LocObserver, MI); } @@ -4178,6 +4180,10 @@ LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI, } } + // Set the insert point after the existing PHIs + MachineBasicBlock &MBB = *MI.getParent(); + MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI()); + // Merge small outputs into MI's def. 
if (NumLeftovers) { mergeMixedSubvectors(MI.getReg(0), OutputRegs); diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp index 246aa88b09acf6..ee499c41c558c3 100644 --- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp @@ -84,21 +84,20 @@ BaseIndexOffset GISelAddressing::getPointerInfo(Register Ptr, MachineRegisterInfo &MRI) { BaseIndexOffset Info; Register PtrAddRHS; - if (!mi_match(Ptr, MRI, m_GPtrAdd(m_Reg(Info.BaseReg), m_Reg(PtrAddRHS)))) { - Info.BaseReg = Ptr; - Info.IndexReg = Register(); - Info.IsIndexSignExt = false; + Register BaseReg; + if (!mi_match(Ptr, MRI, m_GPtrAdd(m_Reg(BaseReg), m_Reg(PtrAddRHS)))) { + Info.setBase(Ptr); + Info.setOffset(0); return Info; } - + Info.setBase(BaseReg); auto RHSCst = getIConstantVRegValWithLookThrough(PtrAddRHS, MRI); if (RHSCst) - Info.Offset = RHSCst->Value.getSExtValue(); + Info.setOffset(RHSCst->Value.getSExtValue()); // Just recognize a simple case for now. In future we'll need to match // indexing patterns for base + index + constant. 
- Info.IndexReg = PtrAddRHS; - Info.IsIndexSignExt = false; + Info.setIndex(PtrAddRHS); return Info; } @@ -114,15 +113,16 @@ bool GISelAddressing::aliasIsKnownForLoadStore(const MachineInstr &MI1, BaseIndexOffset BasePtr0 = getPointerInfo(LdSt1->getPointerReg(), MRI); BaseIndexOffset BasePtr1 = getPointerInfo(LdSt2->getPointerReg(), MRI); - if (!BasePtr0.BaseReg.isValid() || !BasePtr1.BaseReg.isValid()) + if (!BasePtr0.getBase().isValid() || !BasePtr1.getBase().isValid()) return false; int64_t Size1 = LdSt1->getMemSize(); int64_t Size2 = LdSt2->getMemSize(); int64_t PtrDiff; - if (BasePtr0.BaseReg == BasePtr1.BaseReg) { - PtrDiff = BasePtr1.Offset - BasePtr0.Offset; + if (BasePtr0.getBase() == BasePtr1.getBase() && BasePtr0.hasValidOffset() && + BasePtr1.hasValidOffset()) { + PtrDiff = BasePtr1.getOffset() - BasePtr0.getOffset(); // If the size of memory access is unknown, do not use it to do analysis. // One example of unknown size memory access is to load/store scalable // vector objects on the stack. @@ -151,8 +151,8 @@ bool GISelAddressing::aliasIsKnownForLoadStore(const MachineInstr &MI1, // able to calculate their relative offset if at least one arises // from an alloca. However, these allocas cannot overlap and we // can infer there is no alias. - auto *Base0Def = getDefIgnoringCopies(BasePtr0.BaseReg, MRI); - auto *Base1Def = getDefIgnoringCopies(BasePtr1.BaseReg, MRI); + auto *Base0Def = getDefIgnoringCopies(BasePtr0.getBase(), MRI); + auto *Base1Def = getDefIgnoringCopies(BasePtr1.getBase(), MRI); if (!Base0Def || !Base1Def) return false; // Couldn't tell anything. 
@@ -520,16 +520,20 @@ bool LoadStoreOpt::addStoreToCandidate(GStore &StoreMI, Register StoreAddr = StoreMI.getPointerReg(); auto BIO = getPointerInfo(StoreAddr, *MRI); - Register StoreBase = BIO.BaseReg; - uint64_t StoreOffCst = BIO.Offset; + Register StoreBase = BIO.getBase(); if (C.Stores.empty()) { + C.BasePtr = StoreBase; + if (!BIO.hasValidOffset()) { + C.CurrentLowestOffset = 0; + } else { + C.CurrentLowestOffset = BIO.getOffset(); + } // This is the first store of the candidate. // If the offset can't possibly allow for a lower addressed store with the // same base, don't bother adding it. - if (StoreOffCst < ValueTy.getSizeInBytes()) + if (BIO.hasValidOffset() && + BIO.getOffset() < static_cast(ValueTy.getSizeInBytes())) return false; - C.BasePtr = StoreBase; - C.CurrentLowestOffset = StoreOffCst; C.Stores.emplace_back(&StoreMI); LLVM_DEBUG(dbgs() << "Starting a new merge candidate group with: " << StoreMI); @@ -549,8 +553,12 @@ bool LoadStoreOpt::addStoreToCandidate(GStore &StoreMI, // writes to the next lowest adjacent address. if (C.BasePtr != StoreBase) return false; - if ((C.CurrentLowestOffset - ValueTy.getSizeInBytes()) != - static_cast(StoreOffCst)) + // If we don't have a valid offset, we can't guarantee to be an adjacent + // offset. + if (!BIO.hasValidOffset()) + return false; + if ((C.CurrentLowestOffset - + static_cast(ValueTy.getSizeInBytes())) != BIO.getOffset()) return false; // This writes to an adjacent address. Allow it. diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index cbb1a74049fbd7..7e9c992031f8d3 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -236,7 +236,8 @@ namespace { /// was successfully coalesced away. If it is not currently possible to /// coalesce this interval, but it may be possible if other things get /// coalesced, then it returns true by reference in 'Again'. 
- bool joinCopy(MachineInstr *CopyMI, bool &Again); + bool joinCopy(MachineInstr *CopyMI, bool &Again, + SmallPtrSetImpl &CurrentErasedInstrs); /// Attempt to join these two intervals. On failure, this /// returns false. The output "SrcInt" will not have been modified, so we @@ -1964,7 +1965,9 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI, LIS->shrinkToUses(&LI); } -bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { +bool RegisterCoalescer::joinCopy( + MachineInstr *CopyMI, bool &Again, + SmallPtrSetImpl &CurrentErasedInstrs) { Again = false; LLVM_DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI); @@ -2156,7 +2159,9 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { // CopyMI has been erased by joinIntervals at this point. Remove it from // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back // to the work list. This keeps ErasedInstrs from growing needlessly. - ErasedInstrs.erase(CopyMI); + if (ErasedInstrs.erase(CopyMI)) + // But we may encounter the instruction again in this iteration. + CurrentErasedInstrs.insert(CopyMI); // Rewrite all SrcReg operands to DstReg. // Also update DstReg operands to include DstIdx if it is set. @@ -3982,21 +3987,33 @@ void RegisterCoalescer::lateLiveIntervalUpdate() { bool RegisterCoalescer:: copyCoalesceWorkList(MutableArrayRef CurrList) { bool Progress = false; + SmallPtrSet CurrentErasedInstrs; for (MachineInstr *&MI : CurrList) { if (!MI) continue; // Skip instruction pointers that have already been erased, for example by // dead code elimination. - if (ErasedInstrs.count(MI)) { + if (ErasedInstrs.count(MI) || CurrentErasedInstrs.count(MI)) { MI = nullptr; continue; } bool Again = false; - bool Success = joinCopy(MI, Again); + bool Success = joinCopy(MI, Again, CurrentErasedInstrs); Progress |= Success; if (Success || !Again) MI = nullptr; } + // Clear instructions not recorded in `ErasedInstrs` but erased. 
+ if (!CurrentErasedInstrs.empty()) { + for (MachineInstr *&MI : CurrList) { + if (MI && CurrentErasedInstrs.count(MI)) + MI = nullptr; + } + for (MachineInstr *&MI : WorkList) { + if (MI && CurrentErasedInstrs.count(MI)) + MI = nullptr; + } + } return Progress; } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 98d8a6d9409f25..5038f8a1fc1562 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3575,6 +3575,11 @@ static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI, return SDValue(); if (Opcode != ISD::UADDO && Opcode != ISD::USUBO) return SDValue(); + // Guarantee identical type of CarryOut + EVT CarryOutType = N->getValueType(0); + if (CarryOutType != Carry0.getValue(1).getValueType() || + CarryOutType != Carry1.getValue(1).getValueType()) + return SDValue(); // Canonicalize the add/sub of A and B (the top node in the above ASCII art) // as Carry0 and the add/sub of the carry in as Carry1 (the middle node). @@ -3622,7 +3627,7 @@ static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI, // TODO: match other operations that can merge flags (ADD, etc) DAG.ReplaceAllUsesOfValueWith(Carry1.getValue(0), Merged.getValue(0)); if (N->getOpcode() == ISD::AND) - return DAG.getConstant(0, DL, MVT::i1); + return DAG.getConstant(0, DL, CarryOutType); return Merged.getValue(1); } @@ -9253,7 +9258,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { // Transfer chain users from old loads to the new load. 
for (LoadSDNode *L : Loads) - DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1)); + DAG.makeEquivalentMemoryOrdering(L, NewLoad); if (!NeedsBswap) return NewLoad; @@ -9631,8 +9636,15 @@ static SDValue combineShiftOfShiftedLogic(SDNode *Shift, SelectionDAG &DAG) { if (ShiftAmtVal->getBitWidth() != C1Val.getBitWidth()) return false; + // The fold is not valid if the sum of the shift values doesn't fit in the + // given shift amount type. + bool Overflow = false; + APInt NewShiftAmt = C1Val.uadd_ov(*ShiftAmtVal, Overflow); + if (Overflow) + return false; + // The fold is not valid if the sum of the shift values exceeds bitwidth. - if ((*ShiftAmtVal + C1Val).uge(V.getScalarValueSizeInBits())) + if (NewShiftAmt.uge(V.getScalarValueSizeInBits())) return false; return true; diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index fd5160209506f2..19076771ff2eaf 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -2045,6 +2045,11 @@ static bool isEqual(const Function &Caller, const Function &Callee) { Callee.getFnAttribute(AttrClass::getKind()); } +static bool isEqual(const Function &Caller, const Function &Callee, + const StringRef &AttrName) { + return Caller.getFnAttribute(AttrName) == Callee.getFnAttribute(AttrName); +} + /// Compute the logical AND of the attributes of the caller and the /// callee. 
/// diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index cbb64b299e648e..f105bdb4816aa0 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -746,7 +746,7 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp, Min = Min.zext(ResultBitWidth); Max = Max.zext(ResultBitWidth); } - return ConstantRange(std::move(Min), std::move(Max)); + return getNonEmpty(std::move(Min), std::move(Max) + 1); } case Instruction::SIToFP: { // TODO: use input range if available @@ -757,7 +757,7 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp, SMin = SMin.sext(ResultBitWidth); SMax = SMax.sext(ResultBitWidth); } - return ConstantRange(std::move(SMin), std::move(SMax)); + return getNonEmpty(std::move(SMin), std::move(SMax) + 1); } case Instruction::FPTrunc: case Instruction::FPExt: diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 8e508dbdb1c69b..026d252ec5bcd7 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -44,6 +44,7 @@ #include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCSymbolMachO.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/Casting.h" @@ -1950,7 +1951,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, Lex(); } - if (MAI.hasSubsectionsViaSymbols() && CFIStartProcLoc && Sym->isExternal()) + if (MAI.hasSubsectionsViaSymbols() && CFIStartProcLoc && + Sym->isExternal() && !cast(Sym)->isAltEntry()) return Error(StartTokLoc, "non-private labels cannot appear between " ".cfi_startproc / .cfi_endproc pairs") && Error(*CFIStartProcLoc, "previous .cfi_startproc was here"); diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index 155926a8c5949d..1a7ed2db543964 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -677,6 +677,13 @@ static bool 
isECObject(object::SymbolicFile &Obj) { return false; } +bool isImportDescriptor(StringRef Name) { + return Name.starts_with(ImportDescriptorPrefix) || + Name == StringRef{NullImportDescriptorSymbolName} || + (Name.starts_with(NullThunkDataPrefix) && + Name.ends_with(NullThunkDataSuffix)); +} + static Expected> getSymbols(SymbolicFile *Obj, uint16_t Index, raw_ostream &SymNames, @@ -704,6 +711,10 @@ static Expected> getSymbols(SymbolicFile *Obj, if (Map == &SymMap->Map) { Ret.push_back(SymNames.tell()); SymNames << Name << '\0'; + // If EC is enabled, then the import descriptors are NOT put into EC + // objects so we need to copy them to the EC map manually. + if (SymMap->UseECMap && isImportDescriptor(Name)) + SymMap->ECMap[Name] = Index; } } else { Ret.push_back(SymNames.tell()); diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp index 60556c149bf735..d3b5cf2d9f7b52 100644 --- a/llvm/lib/Object/COFFImportFile.cpp +++ b/llvm/lib/Object/COFFImportFile.cpp @@ -52,6 +52,38 @@ StringRef COFFImportFile::getFileFormatName() const { } } +StringRef COFFImportFile::getExportName() const { + const coff_import_header *hdr = getCOFFImportHeader(); + StringRef name = Data.getBuffer().substr(sizeof(*hdr)).split('\0').first; + + auto ltrim1 = [](StringRef s, StringRef chars) { + return !s.empty() && chars.contains(s[0]) ? 
s.substr(1) : s; + }; + + switch (hdr->getNameType()) { + case IMPORT_ORDINAL: + name = ""; + break; + case IMPORT_NAME_NOPREFIX: + name = ltrim1(name, "?@_"); + break; + case IMPORT_NAME_UNDECORATE: + name = ltrim1(name, "?@_"); + name = name.substr(0, name.find('@')); + break; + case IMPORT_NAME_EXPORTAS: { + // Skip DLL name + name = Data.getBuffer().substr(sizeof(*hdr) + name.size() + 1); + name = name.split('\0').second.split('\0').first; + break; + } + default: + break; + } + + return name; +} + static uint16_t getImgRelRelocation(MachineTypes Machine) { switch (Machine) { default: @@ -76,7 +108,7 @@ template static void append(std::vector &B, const T &Data) { } static void writeStringTable(std::vector &B, - ArrayRef Strings) { + ArrayRef Strings) { // The COFF string table consists of a 4-byte value which is the size of the // table, including the length field itself. This value is followed by the // string content itself, which is an array of null-terminated C-style @@ -139,9 +171,6 @@ static Expected replace(StringRef S, StringRef From, return (Twine(S.substr(0, Pos)) + To + S.substr(Pos + From.size())).str(); } -static const std::string NullImportDescriptorSymbolName = - "__NULL_IMPORT_DESCRIPTOR"; - namespace { // This class constructs various small object files necessary to support linking // symbols imported from a DLL. The contents are pretty strictly defined and @@ -160,8 +189,9 @@ class ObjectFactory { public: ObjectFactory(StringRef S, MachineTypes M) : NativeMachine(M), ImportName(S), Library(llvm::sys::path::stem(S)), - ImportDescriptorSymbolName(("__IMPORT_DESCRIPTOR_" + Library).str()), - NullThunkSymbolName(("\x7f" + Library + "_NULL_THUNK_DATA").str()) {} + ImportDescriptorSymbolName((ImportDescriptorPrefix + Library).str()), + NullThunkSymbolName( + (NullThunkDataPrefix + Library + NullThunkDataSuffix).str()) {} // Creates an Import Descriptor. 
This is a small object file which contains a // reference to the terminators and contains the library name (entry) for the @@ -183,6 +213,7 @@ class ObjectFactory { // Library Format. NewArchiveMember createShortImport(StringRef Sym, uint16_t Ordinal, ImportType Type, ImportNameType NameType, + StringRef ExportName, MachineTypes Machine); // Create a weak external file which is described in PE/COFF Aux Format 3. @@ -474,12 +505,13 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { return {MemoryBufferRef{F, ImportName}}; } -NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, - uint16_t Ordinal, - ImportType ImportType, - ImportNameType NameType, - MachineTypes Machine) { +NewArchiveMember +ObjectFactory::createShortImport(StringRef Sym, uint16_t Ordinal, + ImportType ImportType, ImportNameType NameType, + StringRef ExportName, MachineTypes Machine) { size_t ImpSize = ImportName.size() + Sym.size() + 2; // +2 for NULs + if (!ExportName.empty()) + ImpSize += ExportName.size() + 1; size_t Size = sizeof(coff_import_header) + ImpSize; char *Buf = Alloc.Allocate(Size); memset(Buf, 0, Size); @@ -499,6 +531,10 @@ NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, memcpy(P, Sym.data(), Sym.size()); P += Sym.size() + 1; memcpy(P, ImportName.data(), ImportName.size()); + if (!ExportName.empty()) { + P += ImportName.size() + 1; + memcpy(P, ExportName.data(), ExportName.size()); + } return {MemoryBufferRef(StringRef(Buf, Size), ImportName)}; } @@ -615,27 +651,51 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path, ImportType = IMPORT_CONST; StringRef SymbolName = E.SymbolName.empty() ? E.Name : E.SymbolName; - ImportNameType NameType = E.Noname - ? IMPORT_ORDINAL - : getNameType(SymbolName, E.Name, - Machine, MinGW); - Expected Name = E.ExtName.empty() - ? 
std::string(SymbolName) - : replace(SymbolName, E.Name, E.ExtName); - - if (!Name) - return Name.takeError(); - - if (!E.AliasTarget.empty() && *Name != E.AliasTarget) { + std::string Name; + + if (E.ExtName.empty()) { + Name = std::string(SymbolName); + } else { + Expected ReplacedName = + replace(SymbolName, E.Name, E.ExtName); + if (!ReplacedName) + return ReplacedName.takeError(); + Name.swap(*ReplacedName); + } + + if (!E.AliasTarget.empty() && Name != E.AliasTarget) { Members.push_back( - OF.createWeakExternal(E.AliasTarget, *Name, false, Machine)); + OF.createWeakExternal(E.AliasTarget, Name, false, Machine)); Members.push_back( - OF.createWeakExternal(E.AliasTarget, *Name, true, Machine)); + OF.createWeakExternal(E.AliasTarget, Name, true, Machine)); continue; } - Members.push_back( - OF.createShortImport(*Name, E.Ordinal, ImportType, NameType, Machine)); + ImportNameType NameType; + std::string ExportName; + if (E.Noname) { + NameType = IMPORT_ORDINAL; + } else { + NameType = getNameType(SymbolName, E.Name, Machine, MinGW); + } + + // On ARM64EC, use EXPORTAS to import demangled name for mangled symbols. 
+ if (ImportType == IMPORT_CODE && isArm64EC(Machine)) { + if (std::optional MangledName = + getArm64ECMangledFunctionName(Name)) { + if (ExportName.empty()) { + NameType = IMPORT_NAME_EXPORTAS; + ExportName.swap(Name); + } + Name = std::move(*MangledName); + } else if (ExportName.empty()) { + NameType = IMPORT_NAME_EXPORTAS; + ExportName = std::move(*getArm64ECDemangledFunctionName(Name)); + } + } + + Members.push_back(OF.createShortImport(Name, E.Ordinal, ImportType, + NameType, ExportName, Machine)); } return writeArchive(Path, Members, SymtabWritingMode::NormalSymtab, diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index da8e1d87319dde..a357b4cb492111 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -14,6 +14,7 @@ #include "llvm/ProfileData/Coverage/CoverageMapping.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" @@ -583,6 +584,160 @@ static unsigned getMaxBitmapSize(const CounterMappingContext &Ctx, return MaxBitmapID + (SizeInBits / CHAR_BIT); } +namespace { + +/// Collect Decisions, Branchs, and Expansions and associate them. +class MCDCDecisionRecorder { +private: + /// This holds the DecisionRegion and MCDCBranches under it. + /// Also traverses Expansion(s). + /// The Decision has the number of MCDCBranches and will complete + /// when it is filled with unique ConditionID of MCDCBranches. + struct DecisionRecord { + const CounterMappingRegion *DecisionRegion; + + /// They are reflected from DecisionRegion for convenience. + LineColPair DecisionStartLoc; + LineColPair DecisionEndLoc; + + /// This is passed to `MCDCRecordProcessor`, so this should be compatible + /// to`ArrayRef`. 
+ SmallVector MCDCBranches; + + /// IDs that are stored in MCDCBranches + /// Complete when all IDs (1 to NumConditions) are met. + DenseSet ConditionIDs; + + /// Set of IDs of Expansion(s) that are relevant to DecisionRegion + /// and its children (via expansions). + /// FileID pointed by ExpandedFileID is dedicated to the expansion, so + /// the location in the expansion doesn't matter. + DenseSet ExpandedFileIDs; + + DecisionRecord(const CounterMappingRegion &Decision) + : DecisionRegion(&Decision), DecisionStartLoc(Decision.startLoc()), + DecisionEndLoc(Decision.endLoc()) { + assert(Decision.Kind == CounterMappingRegion::MCDCDecisionRegion); + } + + /// Determine whether DecisionRecord dominates `R`. + bool dominates(const CounterMappingRegion &R) const { + // Determine whether `R` is included in `DecisionRegion`. + if (R.FileID == DecisionRegion->FileID && + R.startLoc() >= DecisionStartLoc && R.endLoc() <= DecisionEndLoc) + return true; + + // Determine whether `R` is pointed by any of Expansions. + return ExpandedFileIDs.contains(R.FileID); + } + + enum Result { + NotProcessed = 0, /// Irrelevant to this Decision + Processed, /// Added to this Decision + Completed, /// Added and filled this Decision + }; + + /// Add Branch into the Decision + /// \param Branch expects MCDCBranchRegion + /// \returns NotProcessed/Processed/Completed + Result addBranch(const CounterMappingRegion &Branch) { + assert(Branch.Kind == CounterMappingRegion::MCDCBranchRegion); + + auto ConditionID = Branch.MCDCParams.ID; + assert(ConditionID > 0 && "ConditionID should begin with 1"); + + if (ConditionIDs.contains(ConditionID) || + ConditionID > DecisionRegion->MCDCParams.NumConditions) + return NotProcessed; + + if (!this->dominates(Branch)) + return NotProcessed; + + assert(MCDCBranches.size() < DecisionRegion->MCDCParams.NumConditions); + + // Put `ID=1` in front of `MCDCBranches` for convenience + // even if `MCDCBranches` is not topological. 
+ if (ConditionID == 1) + MCDCBranches.insert(MCDCBranches.begin(), &Branch); + else + MCDCBranches.push_back(&Branch); + + // Mark `ID` as `assigned`. + ConditionIDs.insert(ConditionID); + + // `Completed` when `MCDCBranches` is full + return (MCDCBranches.size() == DecisionRegion->MCDCParams.NumConditions + ? Completed + : Processed); + } + + /// Record Expansion if it is relevant to this Decision. + /// Each `Expansion` may nest. + /// \returns true if recorded. + bool recordExpansion(const CounterMappingRegion &Expansion) { + if (!this->dominates(Expansion)) + return false; + + ExpandedFileIDs.insert(Expansion.ExpandedFileID); + return true; + } + }; + +private: + /// Decisions in progress + /// DecisionRecord is added for each MCDCDecisionRegion. + /// DecisionRecord is removed when Decision is completed. + SmallVector Decisions; + +public: + ~MCDCDecisionRecorder() { + assert(Decisions.empty() && "All Decisions have not been resolved"); + } + + /// Register Region and start recording. + void registerDecision(const CounterMappingRegion &Decision) { + Decisions.emplace_back(Decision); + } + + void recordExpansion(const CounterMappingRegion &Expansion) { + any_of(Decisions, [&Expansion](auto &Decision) { + return Decision.recordExpansion(Expansion); + }); + } + + using DecisionAndBranches = + std::pair /// Branches + >; + + /// Add MCDCBranchRegion to DecisionRecord. + /// \param Branch to be processed + /// \returns DecisionsAndBranches if DecisionRecord completed. + /// Or returns nullopt. + std::optional + processBranch(const CounterMappingRegion &Branch) { + // Seek each Decision and apply Region to it. 
+ for (auto DecisionIter = Decisions.begin(), DecisionEnd = Decisions.end(); + DecisionIter != DecisionEnd; ++DecisionIter) + switch (DecisionIter->addBranch(Branch)) { + case DecisionRecord::NotProcessed: + continue; + case DecisionRecord::Processed: + return std::nullopt; + case DecisionRecord::Completed: + DecisionAndBranches Result = + std::make_pair(DecisionIter->DecisionRegion, + std::move(DecisionIter->MCDCBranches)); + Decisions.erase(DecisionIter); // No longer used. + return Result; + } + + llvm_unreachable("Branch not found in Decisions"); + } +}; + +} // namespace + Error CoverageMapping::loadFunctionRecord( const CoverageMappingRecord &Record, IndexedInstrProfReader &ProfileReader) { @@ -639,18 +794,13 @@ Error CoverageMapping::loadFunctionRecord( Record.MappingRegions[0].Count.isZero() && Counts[0] > 0) return Error::success(); - unsigned NumConds = 0; - const CounterMappingRegion *MCDCDecision; - std::vector MCDCBranches; - + MCDCDecisionRecorder MCDCDecisions; FunctionRecord Function(OrigFuncName, Record.Filenames); for (const auto &Region : Record.MappingRegions) { - // If an MCDCDecisionRegion is seen, track the BranchRegions that follow - // it according to Region.NumConditions. + // MCDCDecisionRegion should be handled first since it overlaps with + // others inside. if (Region.Kind == CounterMappingRegion::MCDCDecisionRegion) { - assert(NumConds == 0); - MCDCDecision = &Region; - NumConds = Region.MCDCParams.NumConditions; + MCDCDecisions.registerDecision(Region); continue; } Expected ExecutionCount = Ctx.evaluate(Region.Count); @@ -665,43 +815,47 @@ Error CoverageMapping::loadFunctionRecord( } Function.pushRegion(Region, *ExecutionCount, *AltExecutionCount); - // If a MCDCDecisionRegion was seen, store the BranchRegions that - // correspond to it in a vector, according to the number of conditions - // recorded for the region (tracked by NumConds). 
- if (NumConds > 0 && Region.Kind == CounterMappingRegion::MCDCBranchRegion) { - MCDCBranches.push_back(&Region); - - // As we move through all of the MCDCBranchRegions that follow the - // MCDCDecisionRegion, decrement NumConds to make sure we account for - // them all before we calculate the bitmap of executed test vectors. - if (--NumConds == 0) { - // Evaluating the test vector bitmap for the decision region entails - // calculating precisely what bits are pertinent to this region alone. - // This is calculated based on the recorded offset into the global - // profile bitmap; the length is calculated based on the recorded - // number of conditions. - Expected ExecutedTestVectorBitmap = - Ctx.evaluateBitmap(MCDCDecision); - if (auto E = ExecutedTestVectorBitmap.takeError()) { - consumeError(std::move(E)); - return Error::success(); - } + // Record ExpansionRegion. + if (Region.Kind == CounterMappingRegion::ExpansionRegion) { + MCDCDecisions.recordExpansion(Region); + continue; + } - // Since the bitmap identifies the executed test vectors for an MC/DC - // DecisionRegion, all of the information is now available to process. - // This is where the bulk of the MC/DC progressing takes place. - Expected Record = Ctx.evaluateMCDCRegion( - *MCDCDecision, *ExecutedTestVectorBitmap, MCDCBranches); - if (auto E = Record.takeError()) { - consumeError(std::move(E)); - return Error::success(); - } + // Do nothing unless MCDCBranchRegion. + if (Region.Kind != CounterMappingRegion::MCDCBranchRegion) + continue; - // Save the MC/DC Record so that it can be visualized later. - Function.pushMCDCRecord(*Record); - MCDCBranches.clear(); - } + auto Result = MCDCDecisions.processBranch(Region); + if (!Result) // Any Decision doesn't complete. + continue; + + auto MCDCDecision = Result->first; + auto &MCDCBranches = Result->second; + + // Evaluating the test vector bitmap for the decision region entails + // calculating precisely what bits are pertinent to this region alone. 
+ // This is calculated based on the recorded offset into the global + // profile bitmap; the length is calculated based on the recorded + // number of conditions. + Expected ExecutedTestVectorBitmap = + Ctx.evaluateBitmap(MCDCDecision); + if (auto E = ExecutedTestVectorBitmap.takeError()) { + consumeError(std::move(E)); + return Error::success(); } + + // Since the bitmap identifies the executed test vectors for an MC/DC + // DecisionRegion, all of the information is now available to process. + // This is where the bulk of the MC/DC progressing takes place. + Expected Record = Ctx.evaluateMCDCRegion( + *MCDCDecision, *ExecutedTestVectorBitmap, MCDCBranches); + if (auto E = Record.takeError()) { + consumeError(std::move(E)); + return Error::success(); + } + + // Save the MC/DC Record so that it can be visualized later. + Function.pushMCDCRecord(*Record); } // Don't create records for (filenames, function) pairs we've already seen. diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp index 1c7d8a8909c488..27727f216b0513 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp @@ -167,7 +167,15 @@ void CoverageMappingWriter::write(raw_ostream &OS) { return LHS.FileID < RHS.FileID; if (LHS.startLoc() != RHS.startLoc()) return LHS.startLoc() < RHS.startLoc(); - return LHS.Kind < RHS.Kind; + + // Put `Decision` before `Expansion`. + auto getKindKey = [](CounterMappingRegion::RegionKind Kind) { + return (Kind == CounterMappingRegion::MCDCDecisionRegion + ? 2 * CounterMappingRegion::ExpansionRegion - 1 + : 2 * Kind); + }; + + return getKindKey(LHS.Kind) < getKindKey(RHS.Kind); }); // Write out the fileid -> filename mapping. 
diff --git a/llvm/lib/Support/FormattedStream.cpp b/llvm/lib/Support/FormattedStream.cpp index c0d28435099570..c50530e76efc0a 100644 --- a/llvm/lib/Support/FormattedStream.cpp +++ b/llvm/lib/Support/FormattedStream.cpp @@ -94,6 +94,9 @@ void formatted_raw_ostream::UpdatePosition(const char *Ptr, size_t Size) { /// ComputePosition - Examine the current output and update line and column /// counts. void formatted_raw_ostream::ComputePosition(const char *Ptr, size_t Size) { + if (DisableScan) + return; + // If our previous scan pointer is inside the buffer, assume we already // scanned those bytes. This depends on raw_ostream to not change our buffer // in unexpected ways. diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index 3c02492e99f1db..db2e4ca92ae40a 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -128,6 +128,7 @@ static const RISCVSupportedExtension SupportedExtensions[] = { {"zicclsm", {1, 0}}, {"ziccrse", {1, 0}}, {"zicntr", {2, 0}}, + {"zicond", {1, 0}}, {"zicsr", {2, 0}}, {"zifencei", {2, 0}}, {"zihintntl", {1, 0}}, @@ -200,8 +201,6 @@ static const RISCVSupportedExtension SupportedExperimentalExtensions[] = { {"zicfilp", {0, 4}}, {"zicfiss", {0, 4}}, - {"zicond", {1, 0}}, - {"zimop", {0, 1}}, {"ztso", {0, 1}}, diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 36700f73df4b20..feabd137c0cf1d 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -837,6 +837,7 @@ include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" include "AArch64SchedTSV110.td" include "AArch64SchedAmpere1.td" +include "AArch64SchedAmpere1B.td" include "AArch64SchedNeoverseN1.td" include "AArch64SchedNeoverseN2.td" include "AArch64SchedNeoverseV1.td" @@ -1376,6 +1377,24 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A", FeatureLdpAlignedOnly, FeatureStpAlignedOnly]>; +def TuneAmpere1B : 
SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B", + "Ampere Computing Ampere-1B processors", [ + FeaturePostRAScheduler, + FeatureFuseAES, + FeatureFuseAdrpAdd, + FeatureAddrLSLFast, + FeatureALULSLFast, + FeatureAggressiveFMA, + FeatureArithmeticBccFusion, + FeatureCmpBccFusion, + FeatureFuseAddress, + FeatureFuseLiterals, + FeatureStorePairSuppress, + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive, + FeatureLdpAlignedOnly, + FeatureStpAlignedOnly]>; + def ProcessorFeatures { list A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeaturePerfMon]; @@ -1529,6 +1548,11 @@ def ProcessorFeatures { FeatureMTE, FeatureSSBS, FeatureRandGen, FeatureSB, FeatureSM4, FeatureSHA2, FeatureSHA3, FeatureAES]; + list Ampere1B = [HasV8_7aOps, FeatureNEON, FeaturePerfMon, + FeatureMTE, FeatureSSBS, FeatureRandGen, + FeatureSB, FeatureSM4, FeatureSHA2, + FeatureSHA3, FeatureAES, FeatureCSSC, + FeatureWFxT, FeatureFullFP16]; // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. 
The extensions do not @@ -1696,6 +1720,9 @@ def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, def : ProcessorModel<"ampere1a", Ampere1Model, ProcessorFeatures.Ampere1A, [TuneAmpere1A]>; +def : ProcessorModel<"ampere1b", Ampere1BModel, ProcessorFeatures.Ampere1B, + [TuneAmpere1B]>; + //===----------------------------------------------------------------------===// // Assembly parser //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index 11248bb7aef31f..55c5bbc66a3f4f 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -24,11 +24,13 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/InitializePasses.h" +#include "llvm/Object/COFF.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/TargetParser/Triple.h" using namespace llvm; +using namespace llvm::object; using OperandBundleDef = OperandBundleDefT; @@ -43,6 +45,8 @@ static cl::opt GenerateThunks("arm64ec-generate-thunks", cl::Hidden, namespace { +enum class ThunkType { GuestExit, Entry, Exit }; + class AArch64Arm64ECCallLowering : public ModulePass { public: static char ID; @@ -69,14 +73,14 @@ class AArch64Arm64ECCallLowering : public ModulePass { Type *I64Ty; Type *VoidTy; - void getThunkType(FunctionType *FT, AttributeList AttrList, bool EntryThunk, + void getThunkType(FunctionType *FT, AttributeList AttrList, ThunkType TT, raw_ostream &Out, FunctionType *&Arm64Ty, FunctionType *&X64Ty); void getThunkRetType(FunctionType *FT, AttributeList AttrList, raw_ostream &Out, Type *&Arm64RetTy, Type *&X64RetTy, SmallVectorImpl &Arm64ArgTypes, SmallVectorImpl &X64ArgTypes, bool &HasSretPtr); - void getThunkArgTypes(FunctionType *FT, AttributeList AttrList, + void getThunkArgTypes(FunctionType *FT, AttributeList 
AttrList, ThunkType TT, raw_ostream &Out, SmallVectorImpl &Arm64ArgTypes, SmallVectorImpl &X64ArgTypes, bool HasSretPtr); @@ -89,10 +93,11 @@ class AArch64Arm64ECCallLowering : public ModulePass { void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT, AttributeList AttrList, - bool EntryThunk, raw_ostream &Out, + ThunkType TT, raw_ostream &Out, FunctionType *&Arm64Ty, FunctionType *&X64Ty) { - Out << (EntryThunk ? "$ientry_thunk$cdecl$" : "$iexit_thunk$cdecl$"); + Out << (TT == ThunkType::Entry ? "$ientry_thunk$cdecl$" + : "$iexit_thunk$cdecl$"); Type *Arm64RetTy; Type *X64RetTy; @@ -102,8 +107,8 @@ void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT, // The first argument to a thunk is the called function, stored in x9. // For exit thunks, we pass the called function down to the emulator; - // for entry thunks, we just call the Arm64 function directly. - if (!EntryThunk) + // for entry/guest exit thunks, we just call the Arm64 function directly. + if (TT == ThunkType::Exit) Arm64ArgTypes.push_back(PtrTy); X64ArgTypes.push_back(PtrTy); @@ -111,14 +116,16 @@ void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT, getThunkRetType(FT, AttrList, Out, Arm64RetTy, X64RetTy, Arm64ArgTypes, X64ArgTypes, HasSretPtr); - getThunkArgTypes(FT, AttrList, Out, Arm64ArgTypes, X64ArgTypes, HasSretPtr); + getThunkArgTypes(FT, AttrList, TT, Out, Arm64ArgTypes, X64ArgTypes, + HasSretPtr); Arm64Ty = FunctionType::get(Arm64RetTy, Arm64ArgTypes, false); + X64Ty = FunctionType::get(X64RetTy, X64ArgTypes, false); } void AArch64Arm64ECCallLowering::getThunkArgTypes( - FunctionType *FT, AttributeList AttrList, raw_ostream &Out, + FunctionType *FT, AttributeList AttrList, ThunkType TT, raw_ostream &Out, SmallVectorImpl &Arm64ArgTypes, SmallVectorImpl &X64ArgTypes, bool HasSretPtr) { @@ -156,9 +163,11 @@ void AArch64Arm64ECCallLowering::getThunkArgTypes( X64ArgTypes.push_back(PtrTy); // x5 Arm64ArgTypes.push_back(I64Ty); - // FIXME: x5 isn't actually passed/used by 
the x64 side; revisit once we - // have proper isel for varargs - X64ArgTypes.push_back(I64Ty); + if (TT != ThunkType::Entry) { + // FIXME: x5 isn't actually used by the x64 side; revisit once we + // have proper isel for varargs + X64ArgTypes.push_back(I64Ty); + } return; } @@ -339,8 +348,7 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT, SmallString<256> ExitThunkName; llvm::raw_svector_ostream ExitThunkStream(ExitThunkName); FunctionType *Arm64Ty, *X64Ty; - getThunkType(FT, Attrs, /*EntryThunk*/ false, ExitThunkStream, Arm64Ty, - X64Ty); + getThunkType(FT, Attrs, ThunkType::Exit, ExitThunkStream, Arm64Ty, X64Ty); if (Function *F = M->getFunction(ExitThunkName)) return F; @@ -443,7 +451,7 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { SmallString<256> EntryThunkName; llvm::raw_svector_ostream EntryThunkStream(EntryThunkName); FunctionType *Arm64Ty, *X64Ty; - getThunkType(F->getFunctionType(), F->getAttributes(), /*EntryThunk*/ true, + getThunkType(F->getFunctionType(), F->getAttributes(), ThunkType::Entry, EntryThunkStream, Arm64Ty, X64Ty); if (Function *F = M->getFunction(EntryThunkName)) return F; @@ -465,10 +473,11 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { bool TransformDirectToSRet = X64RetType->isVoidTy() && !RetTy->isVoidTy(); unsigned ThunkArgOffset = TransformDirectToSRet ? 2 : 1; + unsigned PassthroughArgSize = F->isVarArg() ? 5 : Thunk->arg_size(); // Translate arguments to call. 
SmallVector Args; - for (unsigned i = ThunkArgOffset, e = Thunk->arg_size(); i != e; ++i) { + for (unsigned i = ThunkArgOffset, e = PassthroughArgSize; i != e; ++i) { Value *Arg = Thunk->getArg(i); Type *ArgTy = Arm64Ty->getParamType(i - ThunkArgOffset); if (ArgTy->isArrayTy() || ArgTy->isStructTy() || @@ -485,6 +494,22 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { Args.push_back(Arg); } + if (F->isVarArg()) { + // The 5th argument to variadic entry thunks is used to model the x64 sp + // which is passed to the thunk in x4, this can be passed to the callee as + // the variadic argument start address after skipping over the 32 byte + // shadow store. + + // The EC thunk CC will assign any argument marked as InReg to x4. + Thunk->addParamAttr(5, Attribute::InReg); + Value *Arg = Thunk->getArg(5); + Arg = IRB.CreatePtrAdd(Arg, IRB.getInt64(0x20)); + Args.push_back(Arg); + + // Pass in a zero variadic argument size (in x5). + Args.push_back(IRB.getInt64(0)); + } + // Call the function passed to the thunk. 
Value *Callee = Thunk->getArg(0); Callee = IRB.CreateBitCast(Callee, PtrTy); @@ -518,7 +543,7 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) { llvm::raw_null_ostream NullThunkName; FunctionType *Arm64Ty, *X64Ty; - getThunkType(F->getFunctionType(), F->getAttributes(), /*EntryThunk*/ true, + getThunkType(F->getFunctionType(), F->getAttributes(), ThunkType::GuestExit, NullThunkName, Arm64Ty, X64Ty); auto MangledName = getArm64ECMangledFunctionName(F->getName().str()); assert(MangledName && "Can't guest exit to function that's already native"); diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 78ea4a5180f703..8e67f0f5c8815f 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -213,6 +213,9 @@ def CC_AArch64_Arm64EC_VarArg : CallingConv<[ // address is passed in X9. let Entry = 1 in def CC_AArch64_Arm64EC_Thunk : CallingConv<[ + // ARM64EC-specific: the InReg attribute can be used to access the x64 sp passed into entry thunks in x4 from the IR. + CCIfInReg>>, + // Byval aggregates are passed by pointer CCIfByVal>, diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 352c61d48e2fff..1af064b6de3cba 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1544,6 +1544,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. 
return true; } + case AArch64::COALESCER_BARRIER_FPR16: + case AArch64::COALESCER_BARRIER_FPR32: + case AArch64::COALESCER_BARRIER_FPR64: + case AArch64::COALESCER_BARRIER_FPR128: + MI.eraseFromParent(); + return true; case AArch64::LD1B_2Z_IMM_PSEUDO: return expandMultiVecPseudo( MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index d55deec9760092..732e787d2a321d 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -4339,8 +4339,10 @@ AArch64FrameLowering::inlineStackProbeLoopExactMultiple( ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); MBB.addSuccessor(LoopMBB); // Update liveins. - recomputeLiveIns(*LoopMBB); - recomputeLiveIns(*ExitMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB); + } while (anyChange); return ExitMBB->begin(); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 332fb37655288c..95d8ab95b2c097 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1658,40 +1658,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setMaxAtomicSizeInBitsSupported(128); if (Subtarget->isWindowsArm64EC()) { - // FIXME: are there other intrinsics we need to add here? 
- setLibcallName(RTLIB::MEMCPY, "#memcpy"); - setLibcallName(RTLIB::MEMSET, "#memset"); - setLibcallName(RTLIB::MEMMOVE, "#memmove"); - setLibcallName(RTLIB::REM_F32, "#fmodf"); - setLibcallName(RTLIB::REM_F64, "#fmod"); - setLibcallName(RTLIB::FMA_F32, "#fmaf"); - setLibcallName(RTLIB::FMA_F64, "#fma"); - setLibcallName(RTLIB::SQRT_F32, "#sqrtf"); - setLibcallName(RTLIB::SQRT_F64, "#sqrt"); - setLibcallName(RTLIB::CBRT_F32, "#cbrtf"); - setLibcallName(RTLIB::CBRT_F64, "#cbrt"); - setLibcallName(RTLIB::LOG_F32, "#logf"); - setLibcallName(RTLIB::LOG_F64, "#log"); - setLibcallName(RTLIB::LOG2_F32, "#log2f"); - setLibcallName(RTLIB::LOG2_F64, "#log2"); - setLibcallName(RTLIB::LOG10_F32, "#log10f"); - setLibcallName(RTLIB::LOG10_F64, "#log10"); - setLibcallName(RTLIB::EXP_F32, "#expf"); - setLibcallName(RTLIB::EXP_F64, "#exp"); - setLibcallName(RTLIB::EXP2_F32, "#exp2f"); - setLibcallName(RTLIB::EXP2_F64, "#exp2"); - setLibcallName(RTLIB::EXP10_F32, "#exp10f"); - setLibcallName(RTLIB::EXP10_F64, "#exp10"); - setLibcallName(RTLIB::SIN_F32, "#sinf"); - setLibcallName(RTLIB::SIN_F64, "#sin"); - setLibcallName(RTLIB::COS_F32, "#cosf"); - setLibcallName(RTLIB::COS_F64, "#cos"); - setLibcallName(RTLIB::POW_F32, "#powf"); - setLibcallName(RTLIB::POW_F64, "#pow"); - setLibcallName(RTLIB::LDEXP_F32, "#ldexpf"); - setLibcallName(RTLIB::LDEXP_F64, "#ldexp"); - setLibcallName(RTLIB::FREXP_F32, "#frexpf"); - setLibcallName(RTLIB::FREXP_F64, "#frexp"); + // FIXME: are there intrinsics we need to exclude from this? 
+ for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) { + auto code = static_cast(i); + auto libcallName = getLibcallName(code); + if ((libcallName != nullptr) && (libcallName[0] != '#')) { + setLibcallName(code, Saver.save(Twine("#") + libcallName).data()); + } + } } } @@ -2375,6 +2349,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((AArch64ISD::NodeType)Opcode) { case AArch64ISD::FIRST_NUMBER: break; + MAKE_CASE(AArch64ISD::COALESCER_BARRIER) MAKE_CASE(AArch64ISD::SMSTART) MAKE_CASE(AArch64ISD::SMSTOP) MAKE_CASE(AArch64ISD::RESTORE_ZA) @@ -7154,13 +7129,18 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, } } +static bool isPassedInFPR(EVT VT) { + return VT.isFixedLengthVector() || + (VT.isFloatingPoint() && !VT.isScalableVector()); +} + /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. SDValue AArch64TargetLowering::LowerCallResult( SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &RVLocs, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, - SDValue ThisVal) const { + SDValue ThisVal, bool RequiresSMChange) const { DenseMap CopiedRegs; // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { @@ -7205,6 +7185,10 @@ SDValue AArch64TargetLowering::LowerCallResult( break; } + if (RequiresSMChange && isPassedInFPR(VA.getValVT())) + Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(), + Val); + InVals.push_back(Val); } @@ -7915,6 +7899,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, return ArgReg.Reg == VA.getLocReg(); }); } else { + // Add an extra level of indirection for streaming mode changes by + // using a pseudo copy node that cannot be rematerialised between a + // smstart/smstop and the call by the simple register coalescer. 
+ if (RequiresSMChange && isPassedInFPR(Arg.getValueType())) + Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, + Arg.getValueType(), Arg); RegsToPass.emplace_back(VA.getLocReg(), Arg); RegsUsed.insert(VA.getLocReg()); const TargetOptions &Options = DAG.getTarget().Options; @@ -7991,11 +7981,19 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } if (IsVarArg && Subtarget->isWindowsArm64EC()) { + SDValue ParamPtr = StackPtr; + if (IsTailCall) { + // Create a dummy object at the top of the stack that can be used to get + // the SP after the epilogue + int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true); + ParamPtr = DAG.getFrameIndex(FI, PtrVT); + } + // For vararg calls, the Arm64EC ABI requires values in x4 and x5 // describing the argument list. x4 contains the address of the // first stack parameter. x5 contains the size in bytes of all parameters // passed on the stack. - RegsToPass.emplace_back(AArch64::X4, StackPtr); + RegsToPass.emplace_back(AArch64::X4, ParamPtr); RegsToPass.emplace_back(AArch64::X5, DAG.getConstant(NumBytes, DL, MVT::i64)); } @@ -8151,9 +8149,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Handle result values, copying them out of physregs into vregs that we // return. - SDValue Result = LowerCallResult(Chain, InGlue, CallConv, IsVarArg, RVLocs, - DL, DAG, InVals, IsThisReturn, - IsThisReturn ? OutVals[0] : SDValue()); + SDValue Result = LowerCallResult( + Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn, + IsThisReturn ? 
OutVals[0] : SDValue(), RequiresSMChange); if (!Ins.empty()) InGlue = Result.getValue(Result->getNumValues() - 1); @@ -10702,6 +10700,14 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( parseConstraintCode(Constraint) != AArch64CC::Invalid) return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); + if (Constraint == "{za}") { + return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass); + } + + if (Constraint == "{zt0}") { + return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass); + } + // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; @@ -24403,7 +24409,8 @@ void AArch64TargetLowering::ReplaceBITCASTResults( return; } - if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1) + if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && + !VT.isVector()) return replaceBoolVectorBitcast(N, Results, DAG); if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16)) @@ -26899,7 +26906,7 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( return false; // If the vector is scalable, SVE is enabled, implying support for complex - // numbers. Otherwirse, we need to ensure complex number support is avaialble + // numbers. 
Otherwise, we need to ensure complex number support is available if (!VTy->isScalableTy() && !Subtarget->hasComplxNum()) return false; @@ -26915,7 +26922,7 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( !llvm::isPowerOf2_32(VTyWidth)) return false; - if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2()) { + if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) { unsigned ScalarWidth = ScalarTy->getScalarSizeInBits(); return 8 <= ScalarWidth && ScalarWidth <= 64; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 6505931e17e18d..74d0c4bde8dd2e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -58,6 +58,8 @@ enum NodeType : unsigned { CALL_BTI, // Function call followed by a BTI instruction. + COALESCER_BARRIER, + SMSTART, SMSTOP, RESTORE_ZA, @@ -999,6 +1001,9 @@ class AArch64TargetLowering : public TargetLowering { /// make the right decision when generating code for different targets. 
const AArch64Subtarget *Subtarget; + llvm::BumpPtrAllocator BumpAlloc; + llvm::StringSaver Saver{BumpAlloc}; + bool isExtFreeImpl(const Instruction *Ext) const override; void addTypeForNEON(MVT VT); @@ -1026,7 +1031,7 @@ class AArch64TargetLowering : public TargetLowering { const SmallVectorImpl &RVLocs, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, - SDValue ThisVal) const; + SDValue ThisVal, bool RequiresSMChange) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 2e8d8c63d6bec2..9b4bb7c88bc821 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -4098,16 +4098,6 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { return MI.getOperand(Idx); } -const MachineOperand & -AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - llvm_unreachable("Unexpected opcode"); - case AArch64::LDRBBroX: - return MI.getOperand(4); - } -} - static const TargetRegisterClass *getRegClass(const MachineInstr &MI, Register Reg) { if (MI.getParent() == nullptr) @@ -9597,9 +9587,13 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI, // Update liveins. 
if (MF.getRegInfo().reservedRegsFrozen()) { - recomputeLiveIns(*LoopTestMBB); - recomputeLiveIns(*LoopBodyMBB); - recomputeLiveIns(*ExitMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*ExitMBB) || + recomputeLiveIns(*LoopBodyMBB) || + recomputeLiveIns(*LoopTestMBB); + } while (anyChange); + ; } return ExitMBB->begin(); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index db24a19fe5f8e3..6526f6740747ab 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -111,9 +111,6 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { /// Returns the immediate offset operator of a load/store. static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI); - /// Returns the shift amount operator of a load/store. - static const MachineOperand &getLdStAmountOp(const MachineInstr &MI); - /// Returns whether the instruction is FP or NEON. static bool isFpOrNEON(const MachineInstr &MI); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 03baa7497615e3..ac61dd8745d4e6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4885,19 +4885,9 @@ defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)), (zext (v8i8 V64:$opB))))), (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; -def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), - (v8i16 (add (sub (zext (v8i8 V64:$opA)), - (zext (v8i8 V64:$opB))), - (AArch64vashr v8i16:$src, (i32 15))))), - (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), (zext (extract_high_v16i8 (v16i8 V128:$opB)))))), (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; -def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), - (v8i16 (add (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), - (zext 
(extract_high_v16i8 (v16i8 V128:$opB)))), - (AArch64vashr v8i16:$src, (i32 15))))), - (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)), (zext (v4i16 V64:$opB))))), (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index e90b8a8ca7acee..926a89466255ca 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -62,8 +62,6 @@ STATISTIC(NumUnscaledPairCreated, "Number of load/store from unscaled generated"); STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); -STATISTIC(NumConstOffsetFolded, - "Number of const offset of index address folded"); DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming", "Controls which pairs are considered for renaming"); @@ -77,11 +75,6 @@ static cl::opt LdStLimit("aarch64-load-store-scan-limit", static cl::opt UpdateLimit("aarch64-update-scan-limit", cl::init(100), cl::Hidden); -// The LdStConstLimit limits how far we search for const offset instructions -// when we form index address load/store instructions. -static cl::opt LdStConstLimit("aarch64-load-store-const-scan-limit", - cl::init(10), cl::Hidden); - // Enable register renaming to find additional store pairing opportunities. static cl::opt EnableRenaming("aarch64-load-store-renaming", cl::init(true), cl::Hidden); @@ -178,13 +171,6 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit); - // Scan the instruction list to find a register assigned with a const - // value that can be combined with the current instruction (a load or store) - // using base addressing with writeback. Scan forwards. 
- MachineBasicBlock::iterator - findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit, - unsigned &Offset); - // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan backwards. @@ -196,19 +182,11 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset); - bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI, - unsigned IndexReg, unsigned &Offset); - // Merge a pre- or post-index base register update into a ld/st instruction. MachineBasicBlock::iterator mergeUpdateInsn(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update, bool IsPreIdx); - MachineBasicBlock::iterator - mergeConstOffsetInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update, unsigned Offset, - int Scale); - // Find and merge zero store instructions. bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI); @@ -221,9 +199,6 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find and merge a base register updates before or after a ld/st instruction. bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI); - // Find and merge a index ldr/st instructions into a base ld/st instruction. - bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale); - bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt); bool runOnMachineFunction(MachineFunction &Fn) override; @@ -506,16 +481,6 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { } } -static unsigned getBaseAddressOpcode(unsigned Opc) { - // TODO: Add more index address loads/stores. 
- switch (Opc) { - default: - llvm_unreachable("Opcode has no base address equivalent!"); - case AArch64::LDRBBroX: - return AArch64::LDRBBui; - } -} - static unsigned getPostIndexedOpcode(unsigned Opc) { switch (Opc) { default: @@ -757,20 +722,6 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) { } } -// Make sure this is a reg+reg Ld/St -static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) { - unsigned Opc = MI.getOpcode(); - switch (Opc) { - default: - return false; - // Scaled instructions. - // TODO: Add more index address loads/stores. - case AArch64::LDRBBroX: - Scale = 1; - return true; - } -} - static bool isRewritableImplicitDef(unsigned Opc) { switch (Opc) { default: @@ -2097,63 +2048,6 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, return NextI; } -MachineBasicBlock::iterator -AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update, - unsigned Offset, int Scale) { - assert((Update->getOpcode() == AArch64::MOVKWi) && - "Unexpected const mov instruction to merge!"); - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineBasicBlock::iterator NextI = next_nodbg(I, E); - MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E); - MachineInstr &MemMI = *I; - unsigned Mask = (1 << 12) * Scale - 1; - unsigned Low = Offset & Mask; - unsigned High = Offset - Low; - Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); - Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg(); - MachineInstrBuilder AddMIB, MemMIB; - - // Add IndexReg, BaseReg, High (the BaseReg may be SP) - AddMIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri)) - .addDef(IndexReg) - .addUse(BaseReg) - .addImm(High >> 12) // shifted value - .addImm(12); // shift 12 - (void)AddMIB; - // Ld/St DestReg, IndexReg, Imm12 - unsigned NewOpc = getBaseAddressOpcode(I->getOpcode()); - MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), 
TII->get(NewOpc)) - .add(getLdStRegOp(MemMI)) - .add(AArch64InstrInfo::getLdStOffsetOp(MemMI)) - .addImm(Low / Scale) - .setMemRefs(I->memoperands()) - .setMIFlags(I->mergeFlagsWith(*Update)); - (void)MemMIB; - - ++NumConstOffsetFolded; - LLVM_DEBUG(dbgs() << "Creating base address load/store.\n"); - LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); - LLVM_DEBUG(PrevI->print(dbgs())); - LLVM_DEBUG(dbgs() << " "); - LLVM_DEBUG(Update->print(dbgs())); - LLVM_DEBUG(dbgs() << " "); - LLVM_DEBUG(I->print(dbgs())); - LLVM_DEBUG(dbgs() << " with instruction:\n "); - LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs())); - LLVM_DEBUG(dbgs() << " "); - LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs())); - LLVM_DEBUG(dbgs() << "\n"); - - // Erase the old instructions for the block. - I->eraseFromParent(); - PrevI->eraseFromParent(); - Update->eraseFromParent(); - - return NextI; -} - bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset) { @@ -2201,31 +2095,6 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, return false; } -bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI, - MachineInstr &MI, - unsigned IndexReg, - unsigned &Offset) { - // The update instruction source and destination register must be the - // same as the load/store index register. - if (MI.getOpcode() == AArch64::MOVKWi && - TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) { - - // movz + movk hold a large offset of a Ld/St instruction. - MachineBasicBlock::iterator B = MI.getParent()->begin(); - MachineBasicBlock::iterator MBBI = &MI; - MBBI = prev_nodbg(MBBI, B); - MachineInstr &MovzMI = *MBBI; - if (MovzMI.getOpcode() == AArch64::MOVZWi) { - unsigned Low = MovzMI.getOperand(1).getImm(); - unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm(); - Offset = High + Low; - // 12-bit optionally shifted immediates are legal for adds. 
- return Offset >> 24 == 0; - } - } - return false; -} - MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); @@ -2381,60 +2250,6 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( return E; } -MachineBasicBlock::iterator -AArch64LoadStoreOpt::findMatchingConstOffsetBackward( - MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) { - MachineBasicBlock::iterator B = I->getParent()->begin(); - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr &MemMI = *I; - MachineBasicBlock::iterator MBBI = I; - - // If the load is the first instruction in the block, there's obviously - // not any matching load or store. - if (MBBI == B) - return E; - - // Make sure the IndexReg is killed and the shift amount is zero. - // TODO: Relex this restriction to extend, simplify processing now. - if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() || - !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() || - (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0)) - return E; - - Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg(); - - // Track which register units have been modified and used between the first - // insn (inclusive) and the second insn. - ModifiedRegUnits.clear(); - UsedRegUnits.clear(); - unsigned Count = 0; - do { - MBBI = prev_nodbg(MBBI, B); - MachineInstr &MI = *MBBI; - - // Don't count transient instructions towards the search limit since there - // may be different numbers of them if e.g. debug information is present. - if (!MI.isTransient()) - ++Count; - - // If we found a match, return it. - if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) { - return MBBI; - } - - // Update the status of what the instruction clobbered and used. 
- LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); - - // Otherwise, if the index register is used or modified, we have no match, - // so return early. - if (!ModifiedRegUnits.available(IndexReg) || - !UsedRegUnits.available(IndexReg)) - return E; - - } while (MBBI != B && Count < Limit); - return E; -} - bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; @@ -2619,34 +2434,6 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate return false; } -bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, - int Scale) { - MachineInstr &MI = *MBBI; - MachineBasicBlock::iterator E = MI.getParent()->end(); - MachineBasicBlock::iterator Update; - - // Don't know how to handle unscaled pre/post-index versions below, so bail. - if (TII->hasUnscaledLdStOffset(MI.getOpcode())) - return false; - - // Look back to try to find a const offset for index LdSt instruction. For - // example, - // mov x8, #LargeImm ; = a * (1<<12) + imm12 - // ldr x1, [x0, x8] - // merged into: - // add x8, x0, a * (1<<12) - // ldr x1, [x8, imm12] - unsigned Offset; - Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset); - if (Update != E && (Offset & (Scale - 1)) == 0) { - // Merge the imm12 into the ld/st. - MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale); - return true; - } - - return false; -} - bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt) { @@ -2725,22 +2512,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; } - // 5) Find a register assigned with a const value that can be combined with - // into the load or store. 
e.g., - // mov x8, #LargeImm ; = a * (1<<12) + imm12 - // ldr x1, [x0, x8] - // ; becomes - // add x8, x0, a * (1<<12) - // ldr x1, [x8, imm12] - for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - MBBI != E;) { - int Scale; - if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale)) - Modified = true; - else - ++MBBI; - } - return Modified; } diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index 1e12cf545fa777..37d621cd2f6580 100644 --- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -23,11 +23,13 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/Object/COFF.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; +using namespace llvm::object; extern cl::opt EnableAArch64ELFLocalDynamicTLSGeneration; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index ea9882160d6fb2..48e1c1bc73022c 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -507,6 +507,10 @@ bool AArch64RegisterInfo::isAsmClobberable(const MachineFunction &MF, MCRegisterInfo::regsOverlap(PhysReg, AArch64::X16)) return true; + // ZA/ZT0 registers are reserved but may be permitted in the clobber list. 
+ if (PhysReg == AArch64::ZA || PhysReg == AArch64::ZT0) + return true; + return !isReservedReg(MF, PhysReg); } @@ -1015,6 +1019,8 @@ bool AArch64RegisterInfo::shouldCoalesce( MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg, const TargetRegisterClass *DstRC, unsigned DstSubReg, const TargetRegisterClass *NewRC, LiveIntervals &LIS) const { + MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); + if (MI->isCopy() && ((DstRC->getID() == AArch64::GPR64RegClassID) || (DstRC->getID() == AArch64::GPR64commonRegClassID)) && @@ -1023,5 +1029,38 @@ bool AArch64RegisterInfo::shouldCoalesce( // which implements a 32 to 64 bit zero extension // which relies on the upper 32 bits being zeroed. return false; + + auto IsCoalescerBarrier = [](const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AArch64::COALESCER_BARRIER_FPR16: + case AArch64::COALESCER_BARRIER_FPR32: + case AArch64::COALESCER_BARRIER_FPR64: + case AArch64::COALESCER_BARRIER_FPR128: + return true; + default: + return false; + } + }; + + // For calls that temporarily have to toggle streaming mode as part of the + // call-sequence, we need to be more careful when coalescing copy instructions + // so that we don't end up coalescing the NEON/FP result or argument register + // with a whole Z-register, such that after coalescing the register allocator + // will try to spill/reload the entire Z register. + // + // We do this by checking if the node has any defs/uses that are + // COALESCER_BARRIER pseudos. These are 'nops' in practice, but they exist to + // instruct the coalescer to avoid coalescing the copy. 
+ if (MI->isCopy() && SubReg != DstSubReg && + (AArch64::ZPRRegClass.hasSubClassEq(DstRC) || + AArch64::ZPRRegClass.hasSubClassEq(SrcRC))) { + unsigned SrcReg = MI->getOperand(1).getReg(); + if (any_of(MRI.def_instructions(SrcReg), IsCoalescerBarrier)) + return false; + unsigned DstReg = MI->getOperand(0).getReg(); + if (any_of(MRI.use_nodbg_instructions(DstReg), IsCoalescerBarrier)) + return false; + } + return true; } diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index eeae5303a3f898..acf067f2cc5a9d 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -28,6 +28,8 @@ def AArch64_restore_zt : SDNode<"AArch64ISD::RESTORE_ZT", SDTypeProfile<0, 2, def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisPtrTy<1>]>, [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>; +def AArch64CoalescerBarrier + : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, []>; //===----------------------------------------------------------------------===// // Instruction naming conventions. @@ -189,6 +191,26 @@ def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val), (MSR 0xde85, GPR64:$val)>; def : Pat<(i64 (int_aarch64_sme_get_tpidr2)), (MRS 0xde85)>; + +multiclass CoalescerBarrierPseudo vts> { + def NAME : Pseudo<(outs rc:$dst), (ins rc:$src), []>, Sched<[]> { + let Constraints = "$dst = $src"; + } + foreach vt = vts in { + def : Pat<(vt (AArch64CoalescerBarrier (vt rc:$src))), + (!cast(NAME) rc:$src)>; + } +} + +multiclass CoalescerBarriers { + defm _FPR16 : CoalescerBarrierPseudo; + defm _FPR32 : CoalescerBarrierPseudo; + defm _FPR64 : CoalescerBarrierPseudo; + defm _FPR128 : CoalescerBarrierPseudo; +} + +defm COALESCER_BARRIER : CoalescerBarriers; + } // End let Predicates = [HasSME] // Pseudo to match to smstart/smstop. 
This expands: diff --git a/llvm/lib/Target/AArch64/AArch64SchedA53.td b/llvm/lib/Target/AArch64/AArch64SchedA53.td index 3e4168f5f445f5..c714bad92b7fbb 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA53.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA53.td @@ -29,7 +29,7 @@ def CortexA53Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td index 277ec772cf0f10..ebbc3b72b50609 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA57.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -34,7 +34,7 @@ def CortexA57Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td index 7edce4b61605d2..d6fe84a2c9c9b4 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td @@ -22,7 +22,8 @@ def A64FXModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SMEUnsupported.F, SVEUnsupported.F, [HasMTE, HasMatMulInt8, HasBF16, - HasPAuth, HasPAuthLR, HasCPA]); + HasPAuth, HasPAuthLR, HasCPA, + HasCSSC]); let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td new file mode 100644 index 00000000000000..9c4f000cf351b2 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td @@ -0,0 +1,1149 @@ +//=- AArch64SchedAmpere1B.td - Ampere-1B scheduling def -----*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the Ampere Computing Ampere-1B to +// support instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +// The Ampere-1B core is an out-of-order micro-architecture. The front +// end has branch prediction, with a 10-cycle recovery time from a +// mispredicted branch. Instructions coming out of the front end are +// decoded into internal micro-ops (uops). + +def Ampere1BModel : SchedMachineModel { + let IssueWidth = 12; // Maximum micro-ops dispatch rate. + let MicroOpBufferSize = 192; // micro-op re-order buffer size + let LoadLatency = 3; // Optimistic load latency + let MispredictPenalty = 10; // Branch mispredict penalty + let LoopMicroOpBufferSize = 32; // Instruction queue size + let CompleteModel = 1; + + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + SMEUnsupported.F, + PAUnsupported.F); +} + +let SchedModel = Ampere1BModel in { + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Ampere-1B. 
+ +def Ampere1BUnitA : ProcResource<2>; // integer single-cycle, branch, and flags r/w +def Ampere1BUnitB : ProcResource<2>; // integer single-cycle, and complex shifts +def Ampere1BUnitBS : ProcResource<1>; // integer multi-cycle +def Ampere1BUnitL : ProcResource<2>; // load +def Ampere1BUnitS : ProcResource<2>; // store address calculation +def Ampere1BUnitX : ProcResource<1>; // FP and vector operations, and flag write +def Ampere1BUnitY : ProcResource<1>; // FP and vector operations, and crypto +def Ampere1BUnitZ : ProcResource<1>; // FP store data and FP-to-integer moves + +def Ampere1BUnitAB : ProcResGroup<[Ampere1BUnitA, Ampere1BUnitB]>; +def Ampere1BUnitXY : ProcResGroup<[Ampere1BUnitX, Ampere1BUnitY]>; + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Ampere-1. + +def Ampere1BWrite_1cyc_1A : SchedWriteRes<[Ampere1BUnitA]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_2A : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitA]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_1cyc_1B : SchedWriteRes<[Ampere1BUnitB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1BS_1B : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitB]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_1cyc_1AB : SchedWriteRes<[Ampere1BUnitAB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1AB_1A : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitA]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_1cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_1S : SchedWriteRes<[Ampere1BUnitS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1BWrite_1cyc_2S : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS]> { 
+ let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1Y : SchedWriteRes<[Ampere1BUnitY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1BWrite_2cyc_2AB : SchedWriteRes<[Ampere1BUnitAB, Ampere1BUnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1B_1AB : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1B_1S : SchedWriteRes<[Ampere1BUnitB, Ampere1BUnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_2cyc_1B_1S_1AB : SchedWriteRes<[Ampere1BUnitB, + Ampere1BUnitS, + Ampere1BUnitAB]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1BWrite_2cyc_1S_2Z : SchedWriteRes<[Ampere1BUnitS, + Ampere1BUnitZ, + Ampere1BUnitZ]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1BWrite_2cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1BWrite_2cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1BWrite_3cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1Z : SchedWriteRes<[Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1BWrite_3cyc_1S_1Z : SchedWriteRes<[Ampere1BUnitS, + Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def Ampere1BWrite_3cyc_1S_2Z : SchedWriteRes<[Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def Ampere1BWrite_3cyc_2S_2Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, 
Ampere1BUnitZ]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def Ampere1BWrite_4cyc_1BS_1AB : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitAB]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1BWrite_4cyc_1L : SchedWriteRes<[Ampere1BUnitL]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_2L : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1BWrite_4cyc_1L_1B : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitB]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1BWrite_4cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_1XY : SchedWriteRes<[Ampere1BUnitXY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1BWrite_5cyc_1BS : SchedWriteRes<[Ampere1BUnitBS]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1BWrite_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1BUnitXY, + Ampere1BUnitS, + Ampere1BUnitZ]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def Ampere1BWrite_4cyc_3S_3Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitS, Ampere1BUnitZ, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 4; + let NumMicroOps = 6; +} + +def Ampere1BWrite_5cyc_4S_4Z : SchedWriteRes<[Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def Ampere1BWrite_5cyc_1L_1BS : SchedWriteRes<[Ampere1BUnitL, + Ampere1BUnitBS]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1BWrite_5cyc_3L : SchedWriteRes<[Ampere1BUnitL, + Ampere1BUnitL, + Ampere1BUnitL]> { + let Latency = 5; + let NumMicroOps = 3; +} + +def Ampere1BWrite_5cyc_4L : SchedWriteRes<[Ampere1BUnitL, + Ampere1BUnitL, + Ampere1BUnitL, + Ampere1BUnitL]> { + let Latency = 5; + let 
NumMicroOps = 4; +} + +def Ampere1BWrite_5cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1BWrite_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def Ampere1BWrite_6cyc_1BS_1A : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1BWrite_6cyc_1BS_2A : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitA, + Ampere1BUnitA]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1BWrite_6cyc_1L_1XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitXY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1BWrite_6cyc_2L_2XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def Ampere1BWrite_6cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1BWrite_6cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1BWrite_6cyc_3XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1BWrite_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 6; + let NumMicroOps = 6; +} + +def Ampere1BWrite_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 6; + let NumMicroOps = 9; +} + +def Ampere1BWrite_7cyc_1BS_1XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1BWrite_7cyc_1XY_1Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitZ]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def 
Ampere1BWrite_7cyc_1X_1Z : SchedWriteRes<[Ampere1BUnitX, Ampere1BUnitZ]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1BWrite_7cyc_3L_3XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitL, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 7; + let NumMicroOps = 6; +} + +def Ampere1BWrite_7cyc_4L_4XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 7; + let NumMicroOps = 8; +} + +def Ampere1BWrite_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 7; + let NumMicroOps = 12; +} + +def Ampere1BWrite_8cyc_1BS_1L : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitL]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1BWrite_8cyc_1BS_1XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1BWrite_8cyc_2L_3XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 5; +} + +def Ampere1BWrite_8cyc_3L_3XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitL, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 6; +} + +def Ampere1BWrite_8cyc_4L_4XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 8; +} + +def Ampere1BWrite_8cyc_2XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1BWrite_8cyc_4XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def 
Ampere1BWrite_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitS, Ampere1BUnitS, + Ampere1BUnitZ, Ampere1BUnitZ, + Ampere1BUnitZ, Ampere1BUnitZ]> { + let Latency = 9; + let NumMicroOps = 14; +} + +def Ampere1BWrite_9cyc_1A_1BS_1X : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitX]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1BWrite_9cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1BUnitA, Ampere1BUnitBS, Ampere1BUnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1BWrite_9cyc_3L_3XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitL, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 9; + let NumMicroOps = 6; +} + +def Ampere1BWrite_9cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 9; + let NumMicroOps = 1; +} + +def Ampere1BWrite_9cyc_3XY : SchedWriteRes<[Ampere1BUnitXY, Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1BWrite_10cyc_4L_8XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 10; + let NumMicroOps = 12; +} + +def Ampere1BWrite_11cyc_1BS_2XY : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1BWrite_11cyc_4L_8XY : SchedWriteRes<[Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitL, Ampere1BUnitL, + Ampere1BUnitXY, Ampere1BUnitXY, + Ampere1BUnitXY, Ampere1BUnitXY]> { + let Latency = 11; + let NumMicroOps = 12; +} + +def Ampere1BWrite_12cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 12; + let NumMicroOps = 1; +} + +def Ampere1BWrite_13cyc_1BS_1X : SchedWriteRes<[Ampere1BUnitBS, Ampere1BUnitX]> { + let Latency = 13; + let NumMicroOps = 2; +} + +def Ampere1BWrite_17cyc_1X : SchedWriteRes<[Ampere1BUnitX]> { + let Latency = 
17;
+  let NumMicroOps = 1;
+}
+
+// 64-bit integer divide (SDIVXr/UDIVXr): two multi-cycle integer uops plus
+// one X-unit uop.
+// NOTE(review): latency was 13 here, contradicting both the "19cyc" name and
+// the WriteID64 latency of 19 declared in the coarse mapping below; 13 looks
+// like a copy-paste from Ampere1BWrite_13cyc_1BS_1X. Corrected to 19.
+def Ampere1BWrite_19cyc_2BS_1X : SchedWriteRes<[Ampere1BUnitBS,
+                                                Ampere1BUnitBS,
+                                                Ampere1BUnitX]> {
+  let Latency = 19;
+  let NumMicroOps = 3;
+}
+
+def Ampere1BWrite_19cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
+  let Latency = 19;
+  let NumMicroOps = 1;
+}
+
+def Ampere1BWrite_21cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
+  let Latency = 21;
+  let NumMicroOps = 1;
+}
+
+def Ampere1BWrite_33cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
+  let Latency = 33;
+  let NumMicroOps = 1;
+}
+
+def Ampere1BWrite_39cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
+  let Latency = 39;
+  let NumMicroOps = 1;
+}
+
+def Ampere1BWrite_63cyc_1X : SchedWriteRes<[Ampere1BUnitX]> {
+  let Latency = 63;
+  let NumMicroOps = 1;
+}
+
+// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4),
+// which are a single uop, and for extended registers, which have full flexibility
+// across Unit A or B for both uops.
+// NOTE(review): the SchedVar entries below are missing their
+// <SchedPredicate, [SchedWrite list]> template arguments (likely lost in a
+// copy/extraction step) — as written they will not parse; restore the
+// predicate/write pairs from the upstream AArch64SchedAmpere1B.td.
+def Ampere1BWrite_Arith : SchedWriteVariant<[
+      SchedVar,
+      SchedVar,
+      SchedVar]>;
+
+def Ampere1BWrite_ArithFlagsetting : SchedWriteVariant<[
+      SchedVar,
+      SchedVar,
+      SchedVar]>;
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latencies for Ampere-1B.
+// This provides a coarse model, which is then specialised below.
+ +def : WriteRes; // MOVN, MOVZ +def : WriteRes; // ALU +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Shifted-Reg +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Extended-Reg +def : WriteRes; // EXTR shifts a reg pair +def : WriteRes; // Shift/Scale +def : WriteRes { + let Latency = 13; +} // 32-bit Divide +def : WriteRes { + let Latency = 19; +} // 64-bit Divide +def : WriteRes { + let Latency = 3; +} // 32-bit Multiply +def : WriteRes { + let Latency = 3; +} // 64-bit Multiply +def : WriteRes; +def : WriteRes; +def : WriteRes { + let Latency = 3; +} // Load from base addr plus immediate offset +def : WriteRes { + let Latency = 1; +} // Store to base addr plus immediate offset +def : WriteRes { + let Latency = 1; + let NumMicroOps = 1; +} // Store a register pair. +def : WriteRes; +def : WriteRes { + let Latency = 3; + let NumMicroOps = 1; +} // Load from a register index (maybe scaled). +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} // Store to a register index (maybe scaled). +def : WriteRes { + let Latency = 2; +} // General floating-point ops. +def : WriteRes { + let Latency = 3; +} // Floating-point compare. +def : WriteRes { + let Latency = 3; +} // Float conversion. +def : WriteRes { +} // Float-int register copy. +def : WriteRes { + let Latency = 2; +} // Float-int register copy. +def : WriteRes { + let Latency = 4; +} // Floating-point multiply. +def : WriteRes { + let Latency = 19; +} // Floating-point division. +def : WriteRes { + let Latency = 3; +} // 64bit Vector D ops. +def : WriteRes { + let Latency = 3; +} // 128bit Vector Q ops. +def : WriteRes { + let Latency = 4; +} // Vector loads. +def : WriteRes { + let Latency = 2; +} // Vector stores. 
+ +def : WriteRes { let Unsupported = 1; } + +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +def : WriteRes { + let Latency = 3; +} // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP + +// Forwarding logic. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Specialising the scheduling model further for Ampere-1B. + +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs COPY)>; + +// Branch instructions +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs Bcc, BL, RET)>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; +def : InstRW<[Ampere1BWrite_1cyc_2A], (instrs BLR)>; + +// Common Short Sequence Compression (CSSC) +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instregex "^ABS[WX]")>; +def : InstRW<[Ampere1BWrite_3cyc_1BS], (instregex "^CNT[WX]")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "^CTZ[WX]")>; +def : InstRW<[Ampere1BWrite_1cyc_1AB_1A], (instregex "^[SU](MAX|MIN)[WX]")>; + +// Cryptography instructions +// -- AES encryption/decryption +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AES[DE]")>; +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^AESI?MC")>; +// -- Polynomial multiplication +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^PMUL", "^PMULL")>; +// -- SHA-256 hash +def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA256(H|H2)")>; +// -- SHA-256 schedule update +def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA256SU[01]")>; +// -- SHA-3 instructions +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>; +// -- SHA-512 hash +def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA512(H|H2)")>; +// -- SHA-512 schedule update +def : 
InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA512SU[01]")>; +// -- SHA1 choose/majority/parity +def : InstRW<[Ampere1BWrite_4cyc_1X], (instregex "^SHA1[CMP]")>; +// -- SHA1 hash/schedule update +def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1SU[01]")>; +def : InstRW<[Ampere1BWrite_2cyc_1Y], (instregex "^SHA1H")>; +// -- SM3 hash +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SM3PARTW[12]$", "^SM3SS1$", "^SM3TT[12][AB]$")>; +def : InstRW<[Ampere1BWrite_4cyc_1X], (instrs SM4E, SM4ENCKEY)>; + +// FP and vector load instructions +// -- Load 1-element structure to one/all lanes +// ---- all lanes +def : InstRW<[Ampere1BWrite_6cyc_1L_1XY], + (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// ---- one lane +def : InstRW<[Ampere1BWrite_6cyc_1L_1XY], + (instregex "^LD1i(8|16|32|64)")>; +// -- Load 1-element structure to one/all lanes, 1D size +def : InstRW<[Ampere1BWrite_4cyc_1L], + (instregex "^LD1Rv1d")>; +// -- Load 1-element structures to 1 register +def : InstRW<[Ampere1BWrite_4cyc_1L], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 2 registers +def : InstRW<[Ampere1BWrite_4cyc_2L], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 3 registers +def : InstRW<[Ampere1BWrite_5cyc_3L], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 4 registers +def : InstRW<[Ampere1BWrite_5cyc_4L], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 2-element structure to all lanes of 2 registers, 1D size +def : InstRW<[Ampere1BWrite_4cyc_2L], + (instregex "^LD2Rv1d")>; +// -- Load 2-element structure to all lanes of 2 registers, other sizes +def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], + (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 2-element structure to one lane of 2 registers +def : InstRW<[Ampere1BWrite_6cyc_2L_2XY], + (instregex "^LD2i(8|16|32|64)")>; +// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size +def 
: InstRW<[Ampere1BWrite_6cyc_2L_2XY], + (instregex "^LD2Twov(16b|8h|4s|2d)")>; +// -- Load 2-element structures to 2 registers, 8B/4H/2S size +def : InstRW<[Ampere1BWrite_8cyc_2L_3XY], + (instregex "^LD2Twov(8b|4h|2s)")>; +// -- Load 3-element structure to all lanes of 3 registers, 1D size +def : InstRW<[Ampere1BWrite_5cyc_3L], + (instregex "^LD3Rv1d")>; +// -- Load 3-element structure to all lanes of 3 registers, other sizes +def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 3-element structure to one lane of 3 registers +def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], + (instregex "^LD3i(8|16|32|64)")>; +// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1BWrite_8cyc_3L_3XY], + (instregex "^LD3Threev(16b|8h|4s)")>; +// -- Load 3-element structures to 3 registers, 2D size +def : InstRW<[Ampere1BWrite_7cyc_3L_3XY], + (instregex "^LD3Threev2d")>; +// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1BWrite_9cyc_3L_3XY], + (instregex "^LD3Threev(8b|4h|2s)")>; +// -- Load 4-element structure to all lanes of 4 registers, 1D size +def : InstRW<[Ampere1BWrite_5cyc_4L], + (instregex "^LD4Rv1d")>; +// -- Load 4-element structure to all lanes of 4 registers, other sizes +def : InstRW<[Ampere1BWrite_7cyc_4L_4XY], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 4-element structure to one lane of 4 registers +def : InstRW<[Ampere1BWrite_7cyc_4L_4XY], + (instregex "^LD4i(8|16|32|64)")>; +// -- Load 4-element structures to 4 registers, 2D size +def : InstRW<[Ampere1BWrite_8cyc_4L_4XY], + (instregex "^LD4Fourv2d")>; +// -- Load 4-element structures to 4 registers, 2S size +def : InstRW<[Ampere1BWrite_11cyc_4L_8XY], + (instregex "^LD4Fourv2s")>; +// -- Load 4-element structures to 4 registers, other sizes +def : InstRW<[Ampere1BWrite_10cyc_4L_8XY], + (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>; +// -- Load pair, Q-form +def : InstRW<[Ampere1BWrite_4cyc_2L], 
(instregex "LDN?PQ")>; +// -- Load pair, S/D-form +def : InstRW<[Ampere1BWrite_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>; +// -- Load register +def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "LDU?R[BHSDQ]i")>; +// -- Load register, sign-extended register +def : InstRW<[Ampere1BWrite_4cyc_1L], (instregex "LDR[BHSDQ]ro(W|X)")>; + +// FP and vector store instructions +// -- Store 1-element structure from one lane of 1 register +def : InstRW<[Ampere1BWrite_4cyc_1XY_1S_1Z], + (instregex "^ST1i(8|16|32|64)")>; +// -- Store 1-element structures from 1 register +def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 2 registers +def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 3 registers +def : InstRW<[Ampere1BWrite_4cyc_3S_3Z], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 4 registers +def : InstRW<[Ampere1BWrite_5cyc_4S_4Z], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 2-element structure from one lane of 2 registers +def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z], + (instregex "^ST2i(8|16|32|64)")>; +// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes +def : InstRW<[Ampere1BWrite_5cyc_2XY_2S_2Z], + (instregex "^ST2Twov(16b|8h|4s|2d)")>; +// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1BWrite_6cyc_2XY_2S_2Z], + (instregex "^ST2Twov(8b|4h|2s)")>; +// -- Store 3-element structure from one lane of 3 registers +def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z], + (instregex "^ST3i(8|16|32|64)")>; +// -- Store 3-element structures from 3 registers +def : InstRW<[Ampere1BWrite_6cyc_3XY_3S_3Z], + (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 4-element structure from one lane of 4 registers +def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], + (instregex "^ST4i(8|16|32|64)")>; +// -- 
Store 4-element structures from 4 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv(16b|8h|4s)")>; +// -- Store 4-element structures from 4 registers, 2D sizes +def : InstRW<[Ampere1BWrite_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv2d")>; +// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1BWrite_9cyc_6XY_4S_4Z], + (instregex "^ST4Fourv(8b|4h|2s)")>; +// -- Store pair, Q-form +def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?PQ")>; +// -- Store pair, S/D-form +def : InstRW<[Ampere1BWrite_3cyc_2S_2Z], (instregex "^STN?P[SD]")>; +// -- Store register +def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], (instregex "^STU?R[BHSDQ](ui|i)")>; +// -- Store register, sign-extended register offset +def : InstRW<[Ampere1BWrite_2cyc_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>; + +// FP data processing, bfloat16 format +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFCVT)>; +def : InstRW<[Ampere1BWrite_8cyc_2XY], (instrs BFCVTN, BFCVTN2)>; +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instrs BFMMLA)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^BFMLAL")>; + +// FP data processing, scalar/vector, half precision +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>; +def : InstRW<[Ampere1BWrite_3cyc_1X], + (instregex "^FCMPE?H")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X], + (instregex "^FCCMPE?H")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY], + (instregex "^FCSELH")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>; +// Convert FP to integer, H-form +def : InstRW<[Ampere1BWrite_3cyc_1XY], 
(instregex "^[SUd]CVTFv.[fi]16")>; +// Convert to FP from GPR, H-form +def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]toH$")>; +// Convert to FP from GPR, fixed-point, H-form +def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX]Hri$")>; +def : InstRW<[Ampere1BWrite_9cyc_1X], (instrs FDIVHrr)>; +def : InstRW<[Ampere1BWrite_17cyc_1X], (instregex "^FDIVv.[if]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>; +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>; +def : InstRW<[Ampere1BWrite_9cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if]16")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX16)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if]16")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>; +// FP square root, H-form +def : InstRW<[Ampere1BWrite_21cyc_1X], (instrs FSQRTHr)>; +// FP square root, vector-form, F16 +def : InstRW<[Ampere1BWrite_39cyc_1X], (instregex "^FSQRTv.f16")>; + +// FP data processing, scalar/vector, single/double precision +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1X], + (instregex "^FCMPE?(S|D)")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1X], + (instregex "^FCCMPE?(S|D)")>; +def : InstRW<[Ampere1BWrite_9cyc_1A_1BS_1XY], + 
(instregex "^FCSEL(S|D)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>; +// Convert FP to integer, S/D-form +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^[SUd]CVTFv.[fi](32|64)")>; +// Convert to FP from GPR, S/D-form +def : InstRW<[Ampere1BWrite_8cyc_1BS_1XY], (instregex "^[SU]CVTF_ZPmZ_[DSH]to[DS]$")>; +// Convert to FP from GPR, fixed-point, S/D-form +def : InstRW<[Ampere1BWrite_11cyc_1BS_2XY], (instregex "^[SU]CVTF[SU][WX][SD]ri$")>; +def : InstRW<[Ampere1BWrite_19cyc_1X], (instregex "^FDIVv.[if](64)", "FDIVD")>; +def : InstRW<[Ampere1BWrite_12cyc_1X], (instregex "^FDIVv.[if](32)", "FDIVS")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULX32, FMULX64)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULSrr, FNMULSrr)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instrs FMULDrr, FNMULDrr)>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>; +def : InstRW<[Ampere1BWrite_4cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FRINT(32|64)")>; +def : InstRW<[Ampere1BWrite_63cyc_1X], (instregex "^FSQRTv.f64", "^FSQRTDr")>; +def : InstRW<[Ampere1BWrite_33cyc_1X], (instregex "^FSQRTv.f32", "^FSQRTSr")>; + +// FP miscellaneous instructions +def : InstRW<[Ampere1BWrite_7cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT[HSD]Hr")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex 
"^FCVT[HSD][SD]r")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVTLv")>; +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^FCVT(N|XN)v")>; +def : InstRW<[Ampere1BWrite_7cyc_1X_1Z], (instrs FJCVTZS)>; +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; +def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>; +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>; +def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "^FMOVXDHighr")>; +def : InstRW<[Ampere1BWrite_3cyc_1Z], (instregex "^FMOV[WX][HSD]r")>; + +// Integer arithmetic and logical instructions +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "ADC(W|X)r", "SBC(W|X)r")>; +def : InstRW<[Ampere1BWrite_Arith], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]")>; +def : InstRW<[Ampere1BWrite_1cyc_1AB], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[ri]")>; +def : InstRW<[Ampere1BWrite_ArithFlagsetting], + (instregex "(ADD|AND|BIC|SUB)S[WX]r[sx]")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(ADD|AND|BIC|SUB)S[WX]r[ri]")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(ADC|SBC)S[WX]r")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs RMIF)>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(CCMN|CCMP)(X|W)")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], + (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>; +def : InstRW<[Ampere1BWrite_13cyc_1BS_1X], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[Ampere1BWrite_19cyc_2BS_1X], (instrs SDIVXr, UDIVXr)>; +def : InstRW<[Ampere1BWrite_3cyc_1BS], + (instregex "(S|U)MULHr")>; +def : InstRW<[Ampere1BWrite_4cyc_1BS_1AB], + (instregex "(S|U)?M(ADD|SUB)L?r")>; + +// Integer load instructions +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "(LDNP|LDP|LDPSW)(X|W)")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDR(B|D|H|Q|S)ui")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDR(D|Q|W|X)l")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDTR(B|H|W|X)i")>; +def : 
InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDTRS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDURS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1BWrite_3cyc_1L], + (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>; +def : InstRW<[Ampere1BWrite_1cyc_1L], + (instrs PRFMl, PRFUMi, PRFUMi)>; +def : InstRW<[Ampere1BWrite_1cyc_1L], + (instrs PRFMroW, PRFMroX)>; + +// Integer miscellaneous instructions +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs ADR, ADRP)>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "EXTR(W|X)")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>; +def : InstRW<[Ampere1BWrite_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instregex "CLS(W|X)")>; +def : InstRW<[Ampere1BWrite_1cyc_1A], (instrs SETF8, SETF16)>; +def : InstRW<[Ampere1BWrite_1cyc_1AB], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; +def : InstRW<[Ampere1BWrite_1cyc_1B], + (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], + (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>; + +// Integer store instructions +def : InstRW<[Ampere1BWrite_1cyc_2S], (instregex "STNP(X|W)i")>; +def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STPXi)>; +def : InstRW<[Ampere1BWrite_2cyc_1B_1S], (instrs STPWi)>; +def : InstRW<[Ampere1BWrite_2cyc_1B_1S_1AB], (instregex "STP(W|X)(pre|post)")>; +def : InstRW<[Ampere1BWrite_1cyc_1S], (instrs STTRBi, STTRHi, STTRWi, STTRXi)>; +def : InstRW<[Ampere1BWrite_1cyc_1S], (instregex "STUR(BB|HH|X|W)i", + "STR(X|W)ui", + "STUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STRWroX, STRXroX)>; +def : InstRW<[Ampere1BWrite_1cyc_2S], (instrs STRWroW, STRXroW)>; + +// Memory tagging + +// Insert Random Tags +def : InstRW<[Ampere1BWrite_1cyc_1BS_1B], (instrs IRG, IRGstack)>; +// Load allocation tag +def : InstRW<[Ampere1BWrite_4cyc_1L_1B], (instrs LDG, LDGM)>; +// Store 
allocation tags +def : InstRW<[Ampere1BWrite_1cyc_1S], + (instrs STGi, STGM, STGPreIndex, STGPostIndex)>; +// Store allocation tags and pair of registers +def : InstRW<[Ampere1BWrite_1cyc_2S], + (instrs STGPi, STGPpre, STGPpost)>; +// Store allocation tags and zero data +def : InstRW<[Ampere1BWrite_1cyc_1S], + (instrs STZGi, STZGM, STZGPreIndex, STZGPostIndex)>; +// Store two tags +def : InstRW<[Ampere1BWrite_1cyc_2S], + (instrs ST2Gi, ST2GPreIndex, ST2GPostIndex)>; +// Store two tags and zero data +def : InstRW<[Ampere1BWrite_1cyc_2S], + (instrs STZ2Gi, STZ2GPreIndex, STZ2GPostIndex)>; +// Subtract Pointer +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBP)>; +// Subtract Pointer, flagset +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs SUBPS)>; +// Insert Tag Mask +def : InstRW<[Ampere1BWrite_1cyc_1AB], (instrs GMI)>; +// Arithmetic, immediate to logical address tag +def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs ADDG, SUBG)>; + +// Pointer authentication +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^AUT")>; +def : InstRW<[Ampere1BWrite_6cyc_1BS_1A], + (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>; +def : InstRW<[Ampere1BWrite_6cyc_1BS_2A], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>; +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^PAC")>; +def : InstRW<[Ampere1BWrite_8cyc_1BS_1L], (instregex "^LDRA(A|B)")>; +def : InstRW<[Ampere1BWrite_1cyc_1B], (instrs XPACD, XPACI)>; + +// Vector integer instructions +// -- absolute difference +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv", + "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>; +// -- arithmetic +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD", + "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW", + "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>; +// -- arithmetic, horizontal, 16B +def : InstRW<[Ampere1BWrite_8cyc_4XY], + (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>; +def : 
InstRW<[Ampere1BWrite_8cyc_4XY], + (instregex "^[SU](MIN|MAX)Vv16i8v")>; +// -- arithmetic, horizontal, 4H/4S +def : InstRW<[Ampere1BWrite_4cyc_2XY], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>; +def : InstRW<[Ampere1BWrite_4cyc_2XY], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>; +// -- arithmetic, horizontal, 8B/8H +def : InstRW<[Ampere1BWrite_6cyc_3XY], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>; +def : InstRW<[Ampere1BWrite_6cyc_3XY], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>; +// -- arithmetic, narrowing +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[Ampere1BWrite_6cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>; +// -- arithmetic, pairwise +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>; +// -- arithmetic, saturating +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>; +// -- bit count +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^(CLS|CLZ|CNT)v")>; +// -- compare +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv", + "^CMHIv", "^CMHSv")>; +// -- compare non-zero +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^CMTSTv")>; +// -- dot product +def : InstRW<[Ampere1BWrite_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>; +// -- fp reciprocal estimate +def : InstRW<[Ampere1BWrite_6cyc_1X], (instregex "^FRECPEv", "^FRSQRTEv")>; +// -- integer reciprocal estimate +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; +// -- logical +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; +// -- logical, narrowing +def : InstRW<[Ampere1BWrite_6cyc_2XY], + (instregex "RSHRNv", + "SHRNv", "SQSHRNv", "SQSHRUNv", + "UQXTNv")>; +// -- matrix multiply +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instrs SMMLA, UMMLA, USMMLA)>; +// -- max/min +def : 
InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>; +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; +// -- move immediate +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; +// -- multiply +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; +// -- multiply accumulate +def : InstRW<[Ampere1BWrite_3cyc_1XY], + (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; +// -- negation, saturating +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; +// -- reverse bits/bytes +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; +// -- shift +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// -- shift and accumulate +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; +// -- shift, saturating +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", + "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", + "^UQSHL")>; + +// Vector miscellaneous instructions +// -- duplicate element +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^DUPv.+lane")>; +// -- duplicate from GPR +def : InstRW<[Ampere1BWrite_5cyc_1BS], (instregex "^DUPv.+gpr")>; +// -- extract narrow +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^XTNv")>; +// -- insert/extract element +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; +// -- move FP immediate +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^FMOVv")>; +// -- move element to GPR +def : InstRW<[Ampere1BWrite_5cyc_1X], (instregex "(S|U)MOVv")>; +// -- move from GPR to any element +def : InstRW<[Ampere1BWrite_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; +// -- table lookup +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instrs TBLv8i8One, 
TBLv16i8One, TBXv8i8One, TBXv16i8One)>; +def : InstRW<[Ampere1BWrite_4cyc_2XY], + (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[Ampere1BWrite_6cyc_3XY], + (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[Ampere1BWrite_8cyc_4XY], + (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; +// -- transpose +def : InstRW<[Ampere1BWrite_2cyc_1XY], + (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; +// -- zip/unzip +def : InstRW<[Ampere1BWrite_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>; + +} // SchedModel = Ampere1BModel diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td index 1ef3a2a063382d..48324654949c06 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td +++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td @@ -21,7 +21,7 @@ def CycloneModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td index 2127a34a58d513..6fc4ec3ae41b77 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td @@ -27,7 +27,7 @@ def ExynosM3Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td index 83cf56088d4ced..5163de280f2e4f 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td @@ -27,7 +27,7 @@ def ExynosM4Model : SchedMachineModel { list UnsupportedFeatures = 
!listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td index 85058af86decb5..2ccbe1614dcd79 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td @@ -27,7 +27,7 @@ def ExynosM5Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td index a765cd1cdfe347..e9172e82b099d1 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td @@ -26,7 +26,7 @@ def FalkorModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/llvm/lib/Target/AArch64/AArch64SchedKryo.td index 3551066ee7c35d..258b34c38898cd 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedKryo.td +++ b/llvm/lib/Target/AArch64/AArch64SchedKryo.td @@ -30,7 +30,7 @@ def KryoModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); // FIXME: Remove when all errors have been fixed. 
let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td index 2ec9600f84f7e5..524fa33f498bb0 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td @@ -25,7 +25,7 @@ def NeoverseN1Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(PAUnsupported.F, SMEUnsupported.F, SVEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td index a6fab5e6245f80..8ec124954362f8 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td @@ -19,7 +19,7 @@ def NeoverseN2Model : SchedMachineModel { let CompleteModel = 1; list UnsupportedFeatures = !listconcat(SMEUnsupported.F, - [HasSVE2p1, HasPAuthLR, HasCPA]); + [HasSVE2p1, HasPAuthLR, HasCPA, HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td index 75fbb85dce9d14..613db353cb0aaa 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td @@ -28,7 +28,8 @@ def NeoverseV1Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVE2Unsupported.F, SMEUnsupported.F, - [HasMTE, HasCPA]); + [HasMTE, HasCPA, + HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td index 658d7cdd23a63b..e7de40fdf1deb0 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td @@ -22,7 +22,8 @@ def 
NeoverseV2Model : SchedMachineModel { let CompleteModel = 1; list UnsupportedFeatures = !listconcat(SMEUnsupported.F, - [HasSVE2p1, HasCPA]); + [HasSVE2p1, HasCPA, + HasCSSC]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td index 9e5060f1f36496..0ae9a69fd48265 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td @@ -27,7 +27,7 @@ def TSV110Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); } // Define each kind of processor resource and number available on the TSV110, diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td index e1536f208e448a..8df3f56e45738c 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td @@ -28,7 +28,7 @@ def ThunderXT8XModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td index 89faa92155e00d..ef4baa3dedff93 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -28,7 +28,7 @@ def ThunderX2T99Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); // FIXME: Remove when all errors have been fixed. 
let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td index 8685554b00d76d..796bd4b8b5c9ae 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td @@ -27,7 +27,7 @@ def ThunderX3T110Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE]); + [HasMTE, HasCSSC]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index e3a0606331db1c..dd4c0e2eb64249 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -296,6 +296,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { break; case Ampere1: case Ampere1A: + case Ampere1B: CacheLineSize = 64; PrefFunctionAlignment = Align(64); PrefLoopAlignment = Align(64); diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 16864102df59b0..f8dcbe97b6321a 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -42,6 +42,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { A64FX, Ampere1, Ampere1A, + Ampere1B, AppleA7, AppleA10, AppleA11, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index d611338fc268f9..992b11da7eeee5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -233,15 +233,20 @@ static bool hasPossibleIncompatibleOps(const Function *F) { bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { - SMEAttrs CallerAttrs(*Caller); - SMEAttrs CalleeAttrs(*Callee); + 
SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); + + // When inlining, we should consider the body of the function, not the + // interface. + if (CalleeAttrs.hasStreamingBody()) { + CalleeAttrs.set(SMEAttrs::SM_Compatible, false); + CalleeAttrs.set(SMEAttrs::SM_Enabled, true); + } + if (CalleeAttrs.hasNewZABody()) return false; if (CallerAttrs.requiresLazySave(CalleeAttrs) || - (CallerAttrs.requiresSMChange(CalleeAttrs) && - (!CallerAttrs.hasStreamingInterfaceOrBody() || - !CalleeAttrs.hasStreamingBody()))) { + CallerAttrs.requiresSMChange(CalleeAttrs)) { if (hasPossibleIncompatibleOps(Callee)) return false; } @@ -4062,4 +4067,4 @@ bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) { cast(I->getNextNode())->isUnconditional()) return true; return BaseT::shouldTreatInstructionLikeSelect(I); -} \ No newline at end of file +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index fd69a7d6c33d03..4b9d549e791142 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -622,9 +622,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .lowerIf([=](const LegalityQuery &Query) { LLT DstTy = Query.Types[0]; LLT SrcTy = Query.Types[1]; - return DstTy.isVector() && (SrcTy.getSizeInBits() > 128 || - (DstTy.getScalarSizeInBits() * 2 < - SrcTy.getScalarSizeInBits())); + return DstTy.isVector() && SrcTy.getSizeInBits() > 128 && + DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits(); }) .alwaysLegal(); diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 10e69655f77e10..8b32d593d2a812 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -248,34 +248,6 @@ static inline bool atomicBarrierDroppedOnZero(unsigned Opcode) { return false; } -static inline 
std::optional -getArm64ECMangledFunctionName(std::string Name) { - bool IsCppFn = Name[0] == '?'; - if (IsCppFn && Name.find("$$h") != std::string::npos) - return std::nullopt; - if (!IsCppFn && Name[0] == '#') - return std::nullopt; - - StringRef Prefix = "$$h"; - size_t InsertIdx = 0; - if (IsCppFn) { - InsertIdx = Name.find("@@"); - size_t ThreeAtSignsIdx = Name.find("@@@"); - if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) { - InsertIdx += 2; - } else { - InsertIdx = Name.find("@"); - if (InsertIdx != std::string::npos) - InsertIdx++; - } - } else { - Prefix = "#"; - } - - Name.insert(Name.begin() + InsertIdx, Prefix.begin(), Prefix.end()); - return std::optional(Name); -} - namespace AArch64CC { // The CondCodes constants map directly to the 4-bit encoding of the condition diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index cb29d5d9475981..250e3e350c02e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1506,6 +1506,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeatureExtendedImageInsts, + FeatureFP8ConversionInsts, FeaturePackedTID, FeatureVcmpxPermlaneHazard, FeatureSALUFloatInsts, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index a19b03b9292337..152f495a452ba2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -59,6 +59,30 @@ def gi_wmmaopselvop3pmods : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_wmmavisrc : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_wmmamods : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_wmmamodsf16Neg : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_wmmamodsf16NegAbs : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_swmmacindex8 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_swmmacindex16 : + GIComplexOperandMatcher, 
+ GIComplexPatternEquiv; + def gi_vop3opselmods : GIComplexOperandMatcher, GIComplexPatternEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 4c35649cec6c8a..4f7bf3f7d35e71 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3048,6 +3048,336 @@ bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, return true; } +static MachineSDNode *buildRegSequence32(SmallVectorImpl &Elts, + llvm::SelectionDAG *CurDAG, + const SDLoc &DL) { + unsigned DstRegClass; + EVT DstTy; + switch (Elts.size()) { + case 8: + DstRegClass = AMDGPU::VReg_256RegClassID; + DstTy = MVT::v8i32; + break; + case 4: + DstRegClass = AMDGPU::VReg_128RegClassID; + DstTy = MVT::v4i32; + break; + case 2: + DstRegClass = AMDGPU::VReg_64RegClassID; + DstTy = MVT::v2i32; + break; + default: + llvm_unreachable("unhandled Reg sequence size"); + } + + SmallVector Ops; + Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32)); + for (unsigned i = 0; i < Elts.size(); ++i) { + Ops.push_back(Elts[i]); + Ops.push_back(CurDAG->getTargetConstant( + SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32)); + } + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops); +} + +static MachineSDNode *buildRegSequence16(SmallVectorImpl &Elts, + llvm::SelectionDAG *CurDAG, + const SDLoc &DL) { + SmallVector PackedElts; + assert("unhandled Reg sequence size" && + (Elts.size() == 8 || Elts.size() == 16)); + + // Pack 16-bit elements in pairs into 32-bit register. If both elements are + // unpacked from 32-bit source use it, otherwise pack them using v_perm. 
+ for (unsigned i = 0; i < Elts.size(); i += 2) { + SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i])); + SDValue HiSrc; + if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) { + PackedElts.push_back(HiSrc); + } else { + SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32); + MachineSDNode *Packed = + CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32, + {Elts[i + 1], Elts[i], PackLoLo}); + PackedElts.push_back(SDValue(Packed, 0)); + } + } + + return buildRegSequence32(PackedElts, CurDAG, DL); +} + +static MachineSDNode *buildRegSequence(SmallVectorImpl &Elts, + llvm::SelectionDAG *CurDAG, + const SDLoc &DL, unsigned ElementSize) { + if (ElementSize == 16) + return buildRegSequence16(Elts, CurDAG, DL); + if (ElementSize == 32) + return buildRegSequence32(Elts, CurDAG, DL); + llvm_unreachable("Unhandled element size"); +} + +static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, + SmallVectorImpl &Elts, SDValue &Src, + llvm::SelectionDAG *CurDAG, const SDLoc &DL, + unsigned ElementSize) { + if (ModOpcode == ISD::FNEG) { + Mods |= SISrcMods::NEG; + // Check if all elements also have abs modifier + SmallVector NegAbsElts; + for (auto El : Elts) { + if (El.getOpcode() != ISD::FABS) + break; + NegAbsElts.push_back(El->getOperand(0)); + } + if (Elts.size() != NegAbsElts.size()) { + // Neg + Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0); + } else { + // Neg and Abs + Mods |= SISrcMods::NEG_HI; + Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0); + } + } else { + assert(ModOpcode == ISD::FABS); + // Abs + Mods |= SISrcMods::NEG_HI; + Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0); + } +} + +// Check all f16 elements for modifiers while looking through b32 and v2b16 +// build vector, stop if element does not satisfy ModifierCheck. 
+static void +checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, + std::function ModifierCheck) { + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + if (auto *F16Pair = + dyn_cast(stripBitcast(BV->getOperand(i)))) { + for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) { + SDValue ElF16 = stripBitcast(F16Pair->getOperand(i)); + if (!ModifierCheck(ElF16)) + break; + } + } + } +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + + // mods are on f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsF16; + + checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool { + if (Element.getOpcode() != ISD::FNEG) + return false; + EltsF16.push_back(Element.getOperand(0)); + return true; + }); + + // All elements have neg modifier + if (BV->getNumOperands() * 2 == EltsF16.size()) { + Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0); + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + } + } + + // mods are on v2f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsV2F16; + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElV2f16 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (ElV2f16.getOpcode() != ISD::FNEG) + break; + EltsV2F16.push_back(ElV2f16.getOperand(0)); + } + + // All pairs of elements have neg modifier + if (BV->getNumOperands() == EltsV2F16.size()) { + Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0); + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + } + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + + // mods are on f16 elements + if (auto *BV = 
dyn_cast(stripBitcast(In))) { + SmallVector EltsF16; + checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool { + // Based on first element decide which mod we match, neg or abs + if (EltsF16.empty()) + ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS; + if (ElF16.getOpcode() != ModOpcode) + return false; + EltsF16.push_back(ElF16.getOperand(0)); + return true; + }); + + // All elements have ModOpcode modifier + if (BV->getNumOperands() * 2 == EltsF16.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In), + 16); + } + + // mods are on v2f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsV2F16; + + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElV2f16 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsV2F16.empty()) + ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS; + if (ElV2f16->getOpcode() != ModOpcode) + break; + EltsV2F16.push_back(ElV2f16->getOperand(0)); + } + + // All elements have ModOpcode modifier + if (BV->getNumOperands() == EltsV2F16.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In), + 32); + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsF32; + + if (auto *BV = dyn_cast(stripBitcast(In))) { + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElF32 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsF32.empty()) + ModOpcode = (ElF32.getOpcode() == ISD::FNEG) ? 
ISD::FNEG : ISD::FABS; + if (ElF32.getOpcode() != ModOpcode) + break; + EltsF32.push_back(ElF32.getOperand(0)); + } + + // All elements had ModOpcode modifier + if (BV->getNumOperands() == EltsF32.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In), + 32); + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const { + if (auto *BV = dyn_cast(In)) { + BitVector UndefElements; + if (SDValue Splat = BV->getSplatValue(&UndefElements)) + if (isInlineImmediate(Splat.getNode())) { + if (const ConstantSDNode *C = dyn_cast(Splat)) { + unsigned Imm = C->getAPIntValue().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + if (const ConstantFPSDNode *C = dyn_cast(Splat)) { + unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + llvm_unreachable("unhandled Constant node"); + } + } + + // 16 bit splat + SDValue SplatSrc32 = stripBitcast(In); + if (auto *SplatSrc32BV = dyn_cast(SplatSrc32)) { + if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) { + SDValue SplatSrc16 = stripBitcast(Splat32); + if (auto *SplatSrc16BV = dyn_cast(SplatSrc16)) { + if (SDValue Splat = SplatSrc16BV->getSplatValue()) { + + // f16 + if (isInlineImmediate(Splat.getNode())) { + const ConstantFPSDNode *C = dyn_cast(Splat); + int64_t Imm = C->getValueAPF().bitcastToAPInt().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i16); + return true; + } + + // bf16 + if (const ConstantSDNode *C = dyn_cast(Splat)) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + APInt BF16Value = C->getAPIntValue(); + APInt F32Value = BF16Value.zext(32).shl(16); + if (TII->isInlineConstant(F32Value)) { + int64_t Imm = F32Value.getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + } + } + } 
+ } + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src, + SDValue &IndexKey) const { + unsigned Key = 0; + Src = In; + + if (In.getOpcode() == ISD::SRL) { + const llvm::SDValue &ShiftSrc = In.getOperand(0); + ConstantSDNode *ShiftAmt = dyn_cast(In.getOperand(1)); + if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt && + ShiftAmt->getZExtValue() % 8 == 0) { + Key = ShiftAmt->getZExtValue() / 8; + Src = ShiftSrc; + } + } + + IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src, + SDValue &IndexKey) const { + unsigned Key = 0; + Src = In; + + if (In.getOpcode() == ISD::SRL) { + const llvm::SDValue &ShiftSrc = In.getOperand(0); + ConstantSDNode *ShiftAmt = dyn_cast(In.getOperand(1)); + if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt && + ShiftAmt->getZExtValue() == 16) { + Key = 1; + Src = ShiftSrc; + } + } + + IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 8645490f0b16f1..3b42d88df0c246 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -240,6 +240,16 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; + bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectWMMAModsF16Neg(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectWMMAVISrc(SDValue In, SDValue &Src) const; + + bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) 
const; + bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const; + bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 55d95154c75878..2af53a664ff173 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -577,6 +577,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN}); setMaxAtomicSizeInBitsSupported(64); + setMaxDivRemBitWidthSupported(64); } bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index fdee74d58d2691..f255d098b631c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3956,6 +3956,219 @@ AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( }}; } +static Register buildRegSequence(SmallVectorImpl &Elts, + MachineInstr *InsertPt, + MachineRegisterInfo &MRI) { + const TargetRegisterClass *DstRegClass; + switch (Elts.size()) { + case 8: + DstRegClass = &AMDGPU::VReg_256RegClass; + break; + case 4: + DstRegClass = &AMDGPU::VReg_128RegClass; + break; + case 2: + DstRegClass = &AMDGPU::VReg_64RegClass; + break; + default: + llvm_unreachable("unhandled Reg sequence size"); + } + + MachineIRBuilder B(*InsertPt); + auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE) + .addDef(MRI.createVirtualRegister(DstRegClass)); + for (unsigned i = 0; i < Elts.size(); ++i) { + MIB.addReg(Elts[i]); + MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i)); + } + return MIB->getOperand(0).getReg(); +} + +static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, + SmallVectorImpl &Elts, Register &Src, + MachineInstr 
*InsertPt, + MachineRegisterInfo &MRI) { + if (ModOpcode == TargetOpcode::G_FNEG) { + Mods |= SISrcMods::NEG; + // Check if all elements also have abs modifier + SmallVector NegAbsElts; + for (auto El : Elts) { + Register FabsSrc; + if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc)))) + break; + NegAbsElts.push_back(FabsSrc); + } + if (Elts.size() != NegAbsElts.size()) { + // Neg + Src = buildRegSequence(Elts, InsertPt, MRI); + } else { + // Neg and Abs + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(NegAbsElts, InsertPt, MRI); + } + } else { + assert(ModOpcode == TargetOpcode::G_FABS); + // Abs + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(Elts, InsertPt, MRI); + } +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsF32; + + if (GBuildVector *BV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < BV->getNumSources(); ++i) { + MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsF32.empty()) + ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) ? 
AMDGPU::G_FNEG + : AMDGPU::G_FABS; + if (ElF32->getOpcode() != ModOpcode) + break; + EltsF32.push_back(ElF32->getOperand(1).getReg()); + } + + // All elements had ModOpcode modifier + if (BV->getNumSources() == EltsF32.size()) { + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(), + *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + SmallVector EltsV2F16; + + if (GConcatVectors *CV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < CV->getNumSources(); ++i) { + Register FNegSrc; + if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc)))) + break; + EltsV2F16.push_back(FNegSrc); + } + + // All elements had ModOpcode modifier + if (CV->getNumSources() == EltsV2F16.size()) { + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsV2F16; + + if (GConcatVectors *CV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < CV->getNumSources(); ++i) { + MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsV2F16.empty()) + ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) ? 
AMDGPU::G_FNEG + : AMDGPU::G_FABS; + if (ElV2F16->getOpcode() != ModOpcode) + break; + EltsV2F16.push_back(ElV2F16->getOperand(1).getReg()); + } + + // All elements had ModOpcode modifier + if (CV->getNumSources() == EltsV2F16.size()) { + MachineIRBuilder B(*Root.getParent()); + selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(), + *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const { + std::optional FPValReg; + if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) { + if (TII.isInlineConstant(FPValReg->Value.bitcastToAPInt())) { + return {{[=](MachineInstrBuilder &MIB) { + MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue()); + }}}; + } + // Non-inlineable splat floats should not fall-through for integer immediate + // checks. + return {}; + } + + APInt ICst; + if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) { + if (TII.isInlineConstant(ICst)) { + return { + {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}}; + } + } + + return {}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const { + Register Src = + getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); + unsigned Key = 0; + + Register ShiftSrc; + std::optional ShiftAmt; + if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && + MRI->getType(ShiftSrc).getSizeInBits() == 32 && + ShiftAmt->Value.getZExtValue() % 8 == 0) { + Key = ShiftAmt->Value.getZExtValue() / 8; + Src = ShiftSrc; + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const { + + Register 
Src = + getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); + unsigned Key = 0; + + Register ShiftSrc; + std::optional ShiftAmt; + if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && + MRI->getType(ShiftSrc).getSizeInBits() == 32 && + ShiftAmt->Value.getZExtValue() == 16) { + Src = ShiftSrc; + Key = 1; + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { Register Src; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 12ea46c2895b04..ef7630f137aca6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -199,6 +199,19 @@ class AMDGPUInstructionSelector final : public InstructionSelector { InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAModsF32NegAbs(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAModsF16Neg(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAModsF16NegAbs(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAVISrc(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSWMMACIndex8(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSWMMACIndex16(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 8e74d4c0e94592..17ffb7ec988f0a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ 
-4178,10 +4178,45 @@ bool AMDGPULegalizerInfo::loadInputValue( Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { const SIMachineFunctionInfo *MFI = B.getMF().getInfo(); - const ArgDescriptor *Arg; + const ArgDescriptor *Arg = nullptr; const TargetRegisterClass *ArgRC; LLT ArgTy; - std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); + + CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); + const ArgDescriptor WorkGroupIDX = + ArgDescriptor::createRegister(AMDGPU::TTMP9); + // If GridZ is not programmed in an entry function then the hardware will set + // it to all zeros, so there is no need to mask the GridY value in the low + // order bits. + const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( + AMDGPU::TTMP7, + AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu); + const ArgDescriptor WorkGroupIDZ = + ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); + if (ST.hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) { + switch (ArgType) { + case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: + Arg = &WorkGroupIDX; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: + Arg = &WorkGroupIDY; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: + Arg = &WorkGroupIDZ; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + default: + break; + } + } + + if (!Arg) + std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); if (!Arg) { if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { @@ -6848,6 +6883,21 @@ bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI, return true; } +bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI, + MachineIRBuilder &B) const { + // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. 
+ if (!ST.hasArchitectedSGPRs()) + return false; + LLT S32 = LLT::scalar(32); + Register DstReg = MI.getOperand(0).getReg(); + auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8)); + auto LSB = B.buildConstant(S32, 25); + auto Width = B.buildConstant(S32, 5); + B.buildUbfx(DstReg, TTMP8, LSB, Width); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { MachineIRBuilder &B = Helper.MIRBuilder; @@ -6970,6 +7020,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_workgroup_id_z: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_wave_id: + return legalizeWaveID(MI, B); case Intrinsic::amdgcn_lds_kernel_id: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::LDS_KERNEL_ID); @@ -7134,6 +7186,29 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); case Intrinsic::amdgcn_image_bvh_intersect_ray: return legalizeBVHIntrinsic(MI, B); + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { + Register Index = MI.getOperand(5).getReg(); + LLT S32 = LLT::scalar(32); + if (MRI.getType(Index) != S32) + MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0)); + return true; + } + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { + Register Index = MI.getOperand(7).getReg(); + LLT S32 = LLT::scalar(32); + if (MRI.getType(Index) != S32) + 
MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0)); + return true; + } case Intrinsic::amdgcn_fmed3: { GISelChangeObserver &Observer = Helper.Observer; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 56aabd4f6ab71b..ecbe42681c6690 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -212,6 +212,7 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeImageIntrinsic( MachineInstr &MI, MachineIRBuilder &B, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 5e73411cae9b70..c1b244f50d93f8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -521,10 +521,18 @@ static Value *promoteAllocaUserToVector( // For memset, we don't need to know the previous value because we // currently only allow memsets that cover the whole alloca. Value *Elt = MSI->getOperand(1); - if (DL.getTypeStoreSize(VecEltTy) > 1) { - Value *EltBytes = - Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt); - Elt = Builder.CreateBitCast(EltBytes, VecEltTy); + const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy); + if (BytesPerElt > 1) { + Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt); + + // If the element type of the vector is a pointer, we need to first cast + // to an integer, then use a PtrCast. 
+ if (VecEltTy->isPointerTy()) { + Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8); + Elt = Builder.CreateBitCast(EltBytes, PtrInt); + Elt = Builder.CreateIntToPtr(Elt, VecEltTy); + } else + Elt = Builder.CreateBitCast(EltBytes, VecEltTy); } return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index bdd4e891f15899..09fac963d222d4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4505,6 +4505,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8: + case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 67263f23b98314..bb1c6b73372999 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td 
@@ -414,6 +414,22 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; // The dummy boolean output is divergent from the IR's perspective, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 489cf85693edb2..9ab657f4e7bb4f 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -151,6 +151,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyOpSelHi, ImmTyNegLo, ImmTyNegHi, + ImmTyIndexKey8bit, + ImmTyIndexKey16bit, ImmTyDPP8, ImmTyDppCtrl, ImmTyDppRowMask, @@ -383,6 +385,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isGDS() const { return isImmTy(ImmTyGDS); } bool isLDS() const { return isImmTy(ImmTyLDS); } bool isCPol() const { return isImmTy(ImmTyCPol); } + bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); } + bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); } bool isDppBankMask() const { return isImmTy(ImmTyDppBankMask); } @@ -656,6 +660,14 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isVISrcF16() || isVISrcB32(); } + bool isVISrc_64F16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f16); + } + + bool isVISrc_64B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32); + } + 
bool isVISrc_64B64() const { return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i64); } @@ -672,6 +684,14 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32); } + bool isVISrc_256B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i32); + } + + bool isVISrc_256F32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f32); + } + bool isVISrc_256B64() const { return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i64); } @@ -1047,6 +1067,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { case ImmTyOffset1: OS << "Offset1"; break; case ImmTySMEMOffsetMod: OS << "SMEMOffsetMod"; break; case ImmTyCPol: OS << "CPol"; break; + case ImmTyIndexKey8bit: OS << "index_key"; break; + case ImmTyIndexKey16bit: OS << "index_key"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -1604,6 +1626,11 @@ class AMDGPUAsmParser : public MCTargetAsmParser { ParseStatus parseRegWithFPInputMods(OperandVector &Operands); ParseStatus parseRegWithIntInputMods(OperandVector &Operands); ParseStatus parseVReg32OrOff(OperandVector &Operands); + ParseStatus tryParseIndexKey(OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy); + ParseStatus parseIndexKey8bit(OperandVector &Operands); + ParseStatus parseIndexKey16bit(OperandVector &Operands); + ParseStatus parseDfmtNfmt(int64_t &Format); ParseStatus parseUfmt(int64_t &Format); ParseStatus parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc, @@ -1784,6 +1811,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); + void cvtSWMMAC(MCInst &Inst, const OperandVector &Operands); + void cvtVOPD(MCInst &Inst, const OperandVector &Operands); void cvtVOP3OpSel(MCInst &Inst, const 
OperandVector &Operands, OptionalImmIndexMap &OptionalIdx); @@ -3500,6 +3529,9 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { return !isInlineConstant(Inst, OpIdx); } else if (MO.isReg()) { auto Reg = MO.getReg(); + if (!Reg) { + return false; + } const MCRegisterInfo *TRI = getContext().getRegisterInfo(); auto PReg = mc2PseudoReg(Reg); return isSGPR(PReg, TRI) && PReg != SGPR_NULL; @@ -4364,7 +4396,11 @@ bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, int OpName) { uint64_t TSFlags = MII.get(Opc).TSFlags; // v_dot4 fp8/bf8 neg_lo/neg_hi not allowed on src0 and src1 (allowed on src2) - if (!(TSFlags & SIInstrFlags::IsDOT)) + // v_wmma iu4/iu8 neg_lo not allowed on src2 (allowed on src0, src1) + // v_swmmac f16/bf16 neg_lo/neg_hi not allowed on src2 (allowed on src0, src1) + // other wmma/swmmac instructions don't have neg_lo/neg_hi operand. + if (!(TSFlags & SIInstrFlags::IsDOT) && !(TSFlags & SIInstrFlags::IsWMMA) && + !(TSFlags & SIInstrFlags::IsSWMMAC)) return true; int NegIdx = AMDGPU::getNamedOperandIdx(Opc, OpName); @@ -6465,6 +6501,33 @@ bool AMDGPUAsmParser::tryParseFmt(const char *Pref, return true; } +ParseStatus AMDGPUAsmParser::tryParseIndexKey(OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy) { + const char *Pref = "index_key"; + int64_t ImmVal = 0; + SMLoc Loc = getLoc(); + auto Res = parseIntWithPrefix(Pref, ImmVal); + if (!Res.isSuccess()) + return Res; + + if (ImmTy == AMDGPUOperand::ImmTyIndexKey16bit && (ImmVal < 0 || ImmVal > 1)) + return Error(Loc, Twine("out of range ", StringRef(Pref))); + + if (ImmTy == AMDGPUOperand::ImmTyIndexKey8bit && (ImmVal < 0 || ImmVal > 3)) + return Error(Loc, Twine("out of range ", StringRef(Pref))); + + Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, ImmTy)); + return ParseStatus::Success; +} + +ParseStatus AMDGPUAsmParser::parseIndexKey8bit(OperandVector &Operands) { + return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey8bit); +} + +ParseStatus 
AMDGPUAsmParser::parseIndexKey16bit(OperandVector &Operands) { + return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey16bit); +} + // dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their // values to live in a joint format operand in the MCInst encoding. ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { @@ -8303,12 +8366,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0; if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi || - Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) { + Opc == AMDGPU::V_CVT_SR_FP8_F32_vi || + Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) { Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods Inst.addOperand(Inst.getOperand(0)); } - if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in)) { + // Adding vdst_in operand is already covered for these DPP instructions in + // cvtVOP3DPP. + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) && + !(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) { assert(!IsPacked); Inst.addOperand(Inst.getOperand(0)); } @@ -8329,10 +8400,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, } int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo); - if (NegLoIdx != -1) { + if (NegLoIdx != -1) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo); + + int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); + if (NegHiIdx != -1) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi); - } const int Ops[] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, @@ -8352,11 +8425,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, if (OpSelHiIdx != -1) OpSelHi = 
Inst.getOperand(OpSelHiIdx).getImm(); - if (NegLoIdx != -1) { - int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); + if (NegLoIdx != -1) NegLo = Inst.getOperand(NegLoIdx).getImm(); + + if (NegHiIdx != -1) NegHi = Inst.getOperand(NegHiIdx).getImm(); - } for (int J = 0; J < 3; ++J) { int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); @@ -8392,6 +8465,43 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { cvtVOP3P(Inst, Operands, OptIdx); } +static void addSrcModifiersAndSrc(MCInst &Inst, const OperandVector &Operands, + unsigned i, unsigned Opc, unsigned OpName) { + if (AMDGPU::getNamedOperandIdx(Opc, OpName) != -1) + ((AMDGPUOperand &)*Operands[i]).addRegOrImmWithFPInputModsOperands(Inst, 2); + else + ((AMDGPUOperand &)*Operands[i]).addRegOperands(Inst, 1); +} + +void AMDGPUAsmParser::cvtSWMMAC(MCInst &Inst, const OperandVector &Operands) { + unsigned Opc = Inst.getOpcode(); + + ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); + addSrcModifiersAndSrc(Inst, Operands, 2, Opc, AMDGPU::OpName::src0_modifiers); + addSrcModifiersAndSrc(Inst, Operands, 3, Opc, AMDGPU::OpName::src1_modifiers); + ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); // srcTiedDef + ((AMDGPUOperand &)*Operands[4]).addRegOperands(Inst, 1); // src2 + + OptionalImmIndexMap OptIdx; + for (unsigned i = 5; i < Operands.size(); ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + OptIdx[Op.getImmTy()] = i; + } + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_8bit)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyIndexKey8bit); + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_16bit)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyIndexKey16bit); + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp)) + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClampSI); + + cvtVOP3P(Inst, Operands, OptIdx); +} + 
//===----------------------------------------------------------------------===// // VOPD //===----------------------------------------------------------------------===// @@ -8770,6 +8880,22 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, } } + int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); + if (VdstInIdx == static_cast(Inst.getNumOperands())) { + Inst.addOperand(Inst.getOperand(0)); + } + + bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12; + if (IsVOP3CvtSrDpp) { + if (Src2ModIdx == static_cast(Inst.getNumOperands())) { + Inst.addOperand(MCOperand::createImm(0)); + Inst.addOperand(MCOperand::createReg(0)); + } + } + auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(), MCOI::TIED_TO); if (TiedTo != -1) { diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 86096b0d80b424..a9968cfe25b46b 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -260,8 +260,12 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64) DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 16) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 16) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32) @@ -704,6 +708,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst 
&MI, uint64_t &Size, break; Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address, CS); + if (Res) + break; + + Res = tryDecodeInst(DecoderTableWMMAGFX1264, MI, QW, Address, CS); } while (false); if (Res && AMDGPU::isMAC(MI.getOpcode())) { @@ -712,6 +720,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, AMDGPU::OpName::src2_modifiers); } + if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) { + // Insert dummy unused src2_modifiers. + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src2_modifiers); + } + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) && !AMDGPU::hasGDS(STI)) { insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds); @@ -942,6 +957,7 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const { // first add optional MI operands to check FI DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); + if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) { convertVOP3PDPPInst(MI); } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) || @@ -951,6 +967,15 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { if (isMacDPP(MI)) convertMacDPPInst(MI); + int VDstInIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); + if (VDstInIdx != -1) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); + + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { @@ -977,6 +1002,15 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { if (isMacDPP(MI)) convertMacDPPInst(MI); + int VDstInIdx = + 
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); + if (VDstInIdx != -1) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); + + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); + unsigned Opc = MI.getOpcode(); unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index b6e4e65ff5b03b..08bef7ad3002c6 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1716,14 +1716,14 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { } bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { - if (!SIInstrInfo::isWMMA(*MI)) + if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI)) return false; const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) { - if (!SIInstrInfo::isWMMA(I)) + auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) { + if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I)) return false; // Src0 or Src1 of the current wmma instruction overlaps with the dest of @@ -1753,6 +1753,7 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { const MachineOperand *Src2Mods = TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers); const bool NoSrc2Mods = + !Src2Mods || (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0; // Exception: there is no hazard if the wmma instructions are of the same // type and there is no input modifier on src2 of the current instruction. 
@@ -1760,6 +1761,18 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { TII->pseudoToMCOpcode(MI->getOpcode()))); } + // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall) + // but Index can't overlap with PrevDstReg. + if (AMDGPU::isGFX12Plus(ST)) { + if (SIInstrInfo::isSWMMAC(*MI)) { + const Register CurIndex = + TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); + if (TRI->regsOverlap(PrevDstReg, CurIndex)) + return true; + } + return false; + } + return false; }; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index e73e53aa270f91..abfa4a3531e8e1 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1275,6 +1275,23 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, (ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue; } + // Print three values of neg/opsel for wmma instructions (prints 0 when there + // is no src_modifier operand instead of not printing anything). 
+ if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsSWMMAC || + MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsWMMA) { + NumOps = 0; + int DefaultValue = Mod == SISrcMods::OP_SEL_1; + for (int OpName : + {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}) { + int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName); + if (Idx != -1) + Ops[NumOps++] = MI->getOperand(Idx).getImm(); + else + Ops[NumOps++] = DefaultValue; + } + } + const bool HasDstSel = NumOps > 0 && Mod == SISrcMods::OP_SEL_0 && @@ -1305,6 +1322,16 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Opc = MI->getOpcode(); + if (isCvt_F32_Fp8_Bf8_e64(Opc)) { + auto SrcMod = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + unsigned Mod = MI->getOperand(SrcMod).getImm(); + unsigned Index0 = !!(Mod & SISrcMods::OP_SEL_0); + unsigned Index1 = !!(Mod & SISrcMods::OP_SEL_1); + if (Index0 || Index1) + O << " op_sel:[" << Index0 << ',' << Index1 << ']'; + return; + } if (isPermlane16(Opc)) { auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers); @@ -1336,6 +1363,26 @@ void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo, printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O); } +void AMDGPUInstPrinter::printIndexKey8bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + auto Imm = MI->getOperand(OpNo).getImm() & 0x7; + if (Imm == 0) + return; + + O << " index_key:" << Imm; +} + +void AMDGPUInstPrinter::printIndexKey16bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + auto Imm = MI->getOperand(OpNo).getImm() & 0x7; + if (Imm == 0) + return; + + O << " index_key:" << Imm; +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream 
&O) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index e3958f88277da8..e91ff86b219a0c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -139,6 +139,10 @@ class AMDGPUInstPrinter : public MCInstPrinter { const MCSubtargetInfo &STI, raw_ostream &O); void printNegHi(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printIndexKey8bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printIndexKey16bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 8ab66d4fd5b861..19596d53b45328 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -167,6 +167,9 @@ enum : uint64_t { // ds_gws_* instructions. GWS = UINT64_C(1) << 62, + + // Is a SWMMAC instruction. + IsSWMMAC = UINT64_C(1) << 63, }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. 
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 2862a7787e75a3..a812cdc61500cc 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -208,6 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const { assert(Old.isReg() && Fold.isImm()); if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) || + (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) || (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))) return false; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index cf947dccafac55..d6bf0d8cb2efa8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2072,11 +2072,45 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT, AMDGPUFunctionArgInfo::PreloadedValue PVID) const { - const ArgDescriptor *Reg; + const ArgDescriptor *Reg = nullptr; const TargetRegisterClass *RC; LLT Ty; - std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); + CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv(); + const ArgDescriptor WorkGroupIDX = + ArgDescriptor::createRegister(AMDGPU::TTMP9); + // If GridZ is not programmed in an entry function then the hardware will set + // it to all zeros, so there is no need to mask the GridY value in the low + // order bits. + const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( + AMDGPU::TTMP7, + AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? 
~0u : 0xFFFFu); + const ArgDescriptor WorkGroupIDZ = + ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); + if (Subtarget->hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) { + switch (PVID) { + case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: + Reg = &WorkGroupIDX; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: + Reg = &WorkGroupIDY; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: + Reg = &WorkGroupIDZ; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + default: + break; + } + } + + if (!Reg) + std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); if (!Reg) { if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) { // It's possible for a kernarg intrinsic call to appear in a kernel with @@ -2505,28 +2539,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, } } - if (Info.hasWorkGroupIDX()) { - Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (!HasArchitectedSGPRs) { + if (Info.hasWorkGroupIDX()) { + Register Reg = Info.addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDY()) { - Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (Info.hasWorkGroupIDY()) { + Register Reg = Info.addWorkGroupIDY(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDZ()) { - Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (Info.hasWorkGroupIDZ()) { + Register Reg = Info.addWorkGroupIDZ(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); - - CCInfo.AllocateReg(Reg); + CCInfo.AllocateReg(Reg); + } } if (Info.hasWorkGroupInfo()) { @@ -7890,6 +7920,17 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, 
SDValue Rsrc, return Loads[0]; } +SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const { + // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. + if (!Subtarget->hasArchitectedSGPRs()) + return {}; + SDLoc SL(Op); + MVT VT = MVT::i32; + SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT); + return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8, + DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT)); +} + SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim, const ArgDescriptor &Arg) const { @@ -8060,6 +8101,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_workgroup_id_z: return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_wave_id: + return lowerWaveID(DAG, Op); case Intrinsic::amdgcn_lds_kernel_id: { if (MFI->isEntryFunction()) return getLDSKernelId(DAG, DL); @@ -8242,6 +8285,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SIInstrInfo::MO_ABS32_LO); return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; } + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { + if (Op.getOperand(4).getValueType() == MVT::i32) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), IndexKeyi32); + } + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case 
Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { + if (Op.getOperand(6).getValueType() == MVT::i32) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), + IndexKeyi32, Op.getOperand(7)}); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index d66ba0b59ba906..e436c23af5bcac 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -80,6 +80,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const; + SDValue lowerWaveID(SelectionDAG &DAG, SDValue Op) const; SDValue lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim, const ArgDescriptor &ArgDesc) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 1b66d163714fbc..ab536f8f49d537 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -161,6 +161,9 @@ class InstSI ; def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">; def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">; +def IndexKey16bit : CustomOperand; +def IndexKey8bit : CustomOperand; + def dpp8 : CustomOperand; def dpp_ctrl : CustomOperand; @@ -1344,6 +1347,13 @@ def VOP3PModsDOT : ComplexPattern; def VOP3PModsNeg : ComplexPattern; def WMMAOpSelVOP3PMods : ComplexPattern; +def WMMAModsF32NegAbs : ComplexPattern; +def WMMAModsF16Neg : ComplexPattern; +def WMMAModsF16NegAbs : ComplexPattern; +def WMMAVISrc : ComplexPattern; +def SWMMACIndex8 : ComplexPattern; +def SWMMACIndex16 : ComplexPattern; + def VOP3OpSel : 
ComplexPattern; def VOP3OpSelMods : ComplexPattern; @@ -1684,8 +1694,9 @@ class getIns64 _ArgVT, bit _EnableClamp = 0> { field bit IsDOT = 0; field bit IsSingle = 0; field bit IsWMMA = 0; + field bit IsSWMMAC = 0; + + field bit IsFP8 = 0; field bit HasDst = !ne(DstVT.Value, untyped.Value); field bit HasDst32 = HasDst; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 9ff66a094f991f..0336ec4985ea74 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -751,35 +751,21 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, } // Add system SGPRs. - Register addWorkGroupIDX(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP9 : getNextSystemSGPR(); - ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDX() { + ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDX.getRegister(); } - Register addWorkGroupIDY(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR(); - unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 0xffff : ~0u; - ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDY() { + ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDY.getRegister(); } - Register addWorkGroupIDZ(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR(); - unsigned Mask = HasArchitectedSGPRs ? 
0xffff << 16 : ~0u; - ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDZ() { + ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDZ.getRegister(); } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 84b9330ef9633e..50d8bfa8750818 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -2358,6 +2358,11 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( bool Changed = false; + if (IsNonTemporal) { + // Set non-temporal hint for all cache levels. + Changed |= setTH(MI, AMDGPU::CPol::TH_NT); + } + if (IsVolatile) { Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); @@ -2370,11 +2375,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( Position::AFTER); } - if (IsNonTemporal) { - // Set non-temporal hint for all cache levels. 
- Changed |= setTH(MI, AMDGPU::CPol::TH_NT); - } - return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index f42af89cf5e6d3..b3265b73fa7e11 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1341,9 +1341,14 @@ def VCSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_INLINE_C">; // VISrc_* Operands with a VGPR or an inline constant //===----------------------------------------------------------------------===// +def VISrc_64_f16 : RegOrF16 <"VReg_64", "OPERAND_REG_INLINE_C">; +def VISrc_64_b32 : RegOrB32 <"VReg_64", "OPERAND_REG_INLINE_C">; def VISrc_64_f64 : RegOrF64 <"VReg_64", "OPERAND_REG_INLINE_C">; +def VISrc_128_f16 : RegOrF16 <"VReg_128", "OPERAND_REG_INLINE_C">; def VISrc_128_b32 : RegOrB32 <"VReg_128", "OPERAND_REG_INLINE_C">; def VISrc_128_f32 : RegOrF32 <"VReg_128", "OPERAND_REG_INLINE_C">; +def VISrc_256_b32 : RegOrB32 <"VReg_256", "OPERAND_REG_INLINE_C">; +def VISrc_256_f32 : RegOrF32 <"VReg_256", "OPERAND_REG_INLINE_C">; def VISrc_256_f64 : RegOrF64 <"VReg_256", "OPERAND_REG_INLINE_C">; def VISrc_512_b32 : RegOrB32 <"VReg_512", "OPERAND_REG_INLINE_C">; def VISrc_512_f32 : RegOrF32 <"VReg_512", "OPERAND_REG_INLINE_C">; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 0bf9452d822e97..106fdb19f27895 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -529,6 +529,17 @@ bool isPermlane16(unsigned Opc) { Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12; } +bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) { + return Opc == AMDGPU::V_CVT_F32_BF8_e64_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_gfx12 || + Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp8_gfx12 || + Opc == 
AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 || + Opc == AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12; +} + bool isGenericAtomic(unsigned Opc) { return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN || Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX || diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index d3f55c79201747..11b0bc5c81711e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -535,6 +535,9 @@ bool isPermlane16(unsigned Opc); LLVM_READNONE bool isGenericAtomic(unsigned Opc); +LLVM_READNONE +bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc); + namespace VOPD { enum Component : unsigned { diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 95a1d86963473a..ef652fce65482c 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -571,6 +571,7 @@ let SubtargetPredicate = isGFX9Only in { } // End SubtargetPredicate = isGFX9Only class VOPProfile_Base_CVT_F32_F8 : VOPProfileI2F { + let HasExtDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 1; let HasExt = 1; @@ -599,6 +600,7 @@ class Cvt_F32_F8_Pat; +let SubtargetPredicate = isGFX9Only in { let OtherPredicates = [HasCvtFP8VOP1Bug] in { def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)), (V_CVT_F32_FP8_sdwa 0, $src, 0, 0, 0)>; @@ -617,6 +619,7 @@ foreach Index = [1, 2, 3] in { def : Cvt_F32_F8_Pat; def : Cvt_F32_F8_Pat; } +} // End SubtargetPredicate = isGFX9Only class Cvt_PK_F32_F8_Pat : GCNPat< @@ -626,11 +629,77 @@ class Cvt_PK_F32_F8_Pat; -foreach Index = [0, -1] in { - def : Cvt_PK_F32_F8_Pat; - def : Cvt_PK_F32_F8_Pat; +let SubtargetPredicate = isGFX9Only in { + foreach Index = [0, -1] in { + def : Cvt_PK_F32_F8_Pat; + def : Cvt_PK_F32_F8_Pat; + } +} + + +// Similar to VOPProfile_Base_CVT_F32_F8, but for VOP3 instructions. 
+def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F { + let HasOpSel = 1; + let HasExtVOP3DPP = 0; +} + +def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, untyped, untyped]> { + let HasOpSel = 1; + let HasExtDPP = 1; + let HasExtVOP3DPP = 1; + let IsFP8 = 1; + let HasClamp = 0; + let HasOMod = 0; + let HasModifiers = 1; + let Src1VOP3DPP = Src1RC64; +} + +let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 0, + SchedRW = [WriteFloatCvt] in { + defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>; + defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>; + defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>; + defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>; +} + +class Cvt_F32_F8_Pat_OpSel index, + VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat< + (f32 (node i32:$src, index)), + !if (index, + (inst_e64 !if(index{0}, + !if(index{1}, !or(SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1), + SRCMODS.OP_SEL_0), + !if(index{1}, SRCMODS.OP_SEL_1, 0)), + $src, 0), + (inst_e32 $src)) +>; + +let SubtargetPredicate = isGFX12Plus in { + foreach Index = [0, 1, 2, 3] in { + def : Cvt_F32_F8_Pat_OpSel; + def : Cvt_F32_F8_Pat_OpSel; + } +} + +class Cvt_PK_F32_F8_Pat_OpSel : GCNPat< + (v2f32 (node i32:$src, index)), + !if (index, + (inst_e64 SRCMODS.OP_SEL_0, $src, 0, 0, SRCMODS.NONE), + (inst_e32 $src)) +>; + +let SubtargetPredicate = isGFX12Plus in { + foreach Index = [0, -1] in { + def : Cvt_PK_F32_F8_Pat_OpSel; + def : Cvt_PK_F32_F8_Pat_OpSel; + } } let SubtargetPredicate = isGFX10Plus in { @@ -853,6 +922,20 @@ multiclass VOP1_Real_NO_DPP_OP_SEL_with_name op, VOP3_Real_with_name; +// Define VOP1 instructions using the pseudo instruction with its old profile and +// VOP3 using the OpSel profile for the pseudo instruction. 
+defm V_CVT_F32_FP8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06c, "V_CVT_F32_FP8", "v_cvt_f32_fp8">; +defm V_CVT_F32_FP8 : VOP1_Realtriple_e64_with_name; + +defm V_CVT_F32_BF8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06d, "V_CVT_F32_BF8", "v_cvt_f32_bf8">; +defm V_CVT_F32_BF8 : VOP1_Realtriple_e64_with_name; + +defm V_CVT_PK_F32_FP8 : VOP1_Real_e32_with_name; +defm V_CVT_PK_F32_FP8 : VOP3_Real_with_name; + +defm V_CVT_PK_F32_BF8 : VOP1_Real_e32_with_name; +defm V_CVT_PK_F32_BF8 : VOP3_Real_with_name; + defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00c, "V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">; defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00d, diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 713b4712d563c0..14db5221021489 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -520,8 +520,26 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile { let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0, FP32InputMods:$src1_modifiers, Src1RC64:$src1, VGPR_32:$vdst_in, op_sel0:$op_sel); + let InsVOP3DPP = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + + let InsVOP3DPP16 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl, FI:$fi); + let InsVOP3DPP8 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, FI:$fi); + let HasClamp = 0; - let HasExtVOP3DPP = 0; + let HasExtVOP3DPP = 1; } def 
VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, @@ -530,14 +548,36 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, FP32InputMods:$src1_modifiers, Src1RC64:$src1, FP32InputMods:$src2_modifiers, VGPR_32:$src2, op_sel0:$op_sel); + let InsVOP3DPP16 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + FP32InputMods:$src2_modifiers, VGPR_32:$src2, + op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl, FI:$fi); + let InsVOP3DPP8 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + FP32InputMods:$src2_modifiers, VGPR_32:$src2, + op_sel0:$op_sel, dpp8:$dpp8, FI:$fi); let HasClamp = 0; let HasSrc2 = 0; let HasSrc2Mods = 1; + let HasExtVOP3DPP = 1; + let HasOpSel = 1; let AsmVOP3OpSel = !subst(", $src2_modifiers", "", getAsmVOP3OpSel<3, HasClamp, HasOMod, HasSrc0FloatMods, HasSrc1FloatMods, HasSrc2FloatMods>.ret); - let HasExtVOP3DPP = 0; + let AsmVOP3DPP16 = !subst(", $src2_modifiers", "", + getAsmVOP3DPP16.ret>.ret); + let AsmVOP3DPP8 = !subst(", $src2_modifiers", "", + getAsmVOP3DPP8.ret>.ret); } def IsPow2Plus1: PatLeaf<(i32 imm), [{ @@ -618,13 +658,13 @@ let SubtargetPredicate = HasFP8ConversionInsts, mayRaiseFPException = 0, class Cvt_PK_F8_F32_Pat : GCNPat< (i32 (node f32:$src0, f32:$src1, i32:$old, index)), - (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, !if(index, SRCMODS.OP_SEL_0, 0)) + (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0) >; class Cvt_SR_F8_F32_Pat index, VOP3_Pseudo inst> : GCNPat< (i32 (node f32:$src0, i32:$src1, i32:$old, index)), (inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, - !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, !if(index{1}, SRCMODS.OP_SEL_0, 0)) + !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, 0) >; foreach Index = [0, -1] in { @@ -998,6 +1038,11 @@ defm V_MAXIMUM_F16 : 
VOP3Only_Realtriple_t16_gfx12<0x368>; defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>; defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>; +defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>; +defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>; +defm V_CVT_SR_FP8_F32 : VOP3Only_Realtriple_gfx12<0x36b>; +defm V_CVT_SR_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36c>; + //===----------------------------------------------------------------------===// // GFX11, GFX12 //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 0c7a08cd4bc91f..107b95a9ca8eb0 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -936,16 +936,19 @@ multiclass WMMAInst(NAME # _threeaddr # Suffix)>; } - if !eq(Type, WMMAOpSel) then { - def : WMMAOpSelPat(NAME # _twoaddr # Suffix), node, P>; - } else if !eq(Type, WMMAUIClamp) then { - def : WMMAUIClampPat(NAME # _twoaddr # Suffix), node, P>; - } else { - def : WMMARegularPat(NAME # _twoaddr # Suffix), node, P>; + let SubtargetPredicate = isGFX11Only in { + if !eq(Type, WMMAOpSel) then { + def : WMMAOpSelPat(NAME # _twoaddr # Suffix), node, P>; + } else if !eq(Type, WMMAUIClamp) then { + def : WMMAUIClampPat(NAME # _twoaddr # Suffix), node, P>; + } else { + def : WMMARegularPat(NAME # _twoaddr # Suffix), node, P>; + } } } + let WaveSizePredicate = isWave32 in { defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>; defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>; @@ -969,6 +972,398 @@ let WaveSizePredicate = isWave64 in { } +class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, + bit _IsIU, 
bit _IsFP8BF8> + : VOP3P_Profile> { + bit IsIU = _IsIU; + bit IsFP8BF8 = _IsFP8BF8; + bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8)); + + int IndexType = _IndexType; + + let IsPacked = 1; + let IsWMMA = !not(_IsSWMMAC); + let IsSWMMAC = _IsSWMMAC; + + bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP); + bit IsAB_BF16 = !and(IsF16BF16, isIntType.ret); + bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32)); + bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16)); + bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16)); + + bit NegLo01 = !or(IsF16BF16, IsIU); + bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); + bit NegHi01 = IsF16BF16; + bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); + bit NegLoAny = !or(NegLo01, NegLo2); + bit NegHiAny = !or(NegHi01, NegHi2); + + let DstRC = !cond(!eq(ArgTy[0], v8f32): VDst_256, + !eq(ArgTy[0], v8i32): VDst_256, + !eq(ArgTy[0], v8f16): VDst_128, + !eq(ArgTy[0], v8i16): VDst_128, + !eq(ArgTy[0], v4f32): VDst_128, + !eq(ArgTy[0], v4i32): VDst_128, + !eq(ArgTy[0], v4f16): VDst_64, + !eq(ArgTy[0], v4i16): VDst_64); + let Src0RC64 = !cond(!eq(ArgTy[1], v8f16): VRegSrc_128, + !eq(ArgTy[1], v4f16): VRegSrc_64, + !eq(ArgTy[1], v4i16): VRegSrc_64, + !eq(ArgTy[1], v8i16): VRegSrc_128, + !eq(ArgTy[1], v4i32): VRegSrc_128, + !eq(ArgTy[1], v2i32): VRegSrc_64, + !eq(ArgTy[1], i32) : VRegSrc_32); + let Src1RC64 = !cond(!eq(ArgTy[2], v16f16): VRegSrc_256, + !eq(ArgTy[2], v16i16): VRegSrc_256, + !eq(ArgTy[2], v8f16): VRegSrc_128, + !eq(ArgTy[2], v8i16): VRegSrc_128, + !eq(ArgTy[2], v4i32): VRegSrc_128, + !eq(ArgTy[1], v4i16): VRegSrc_64, + !eq(ArgTy[1], v4f16): VRegSrc_64, + !eq(ArgTy[2], v2i32): VRegSrc_64, + !eq(ArgTy[2], i32) : VRegSrc_32); + let Src2RC64 = !if(IsSWMMAC, DstRC, + !cond(!eq(ArgTy[3], v8f32): VISrc_256_f32, + !eq(ArgTy[3], v8i32): VISrc_256_b32, + !eq(ArgTy[3], v8f16): VISrc_128_f16, + !eq(ArgTy[3], v8i16): VISrc_128_f32, // bf16 + !eq(ArgTy[3], v4f16): VISrc_64_f16, + !eq(ArgTy[3], v4i16): VISrc_64_b32, + 
!eq(ArgTy[3], v4i32): VISrc_128_b32, + !eq(ArgTy[3], v4f32): VISrc_128_f32)); + + // For f16 and bf16 matrices A and B, each element can be modified by + // fneg(neg_lo,neg_hi = 1). For iu4 and iu8 matrices A and B neg_lo is + // overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned(zext) + // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16 and f32 matrix C each + // element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1). + + // Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index + // --------------------------------------------------------------------------- + // wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32) + // wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // wmma f16_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f16 or bf16) + // wmma bf16_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f16 or bf16) + // --------------------------------------------------------------------------- + // wmma i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for + // | neg_lo = 1 i4/i8(sext) | i32 matrices + // --------------------------------------------------------------------------- + // wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32) + // (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix + // swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac f16_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix + // swmmac bf16_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for sparse matrix 
+ // | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac f32_fp8/bf8 | not allowed for | not allowed for sparse matrix + // (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst + + // pseudo + + // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 + // use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers, + // remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32 + // f16 or bf16). swmmac use index_key and don't use src 2 modifiers. + + dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers)); + dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers)); + dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers)); + dag IndexKey = !cond(!eq(IndexType, 0) : (ins), + !eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit), + !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit)); + dag Clamp = !if(IsIU, (ins clampmod0:$clamp), (ins)); + dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), + !and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo), + !and(!not(NegLoAny), !not(NegHiAny)) : (ins)); + + let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1), + !cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)), + IsSWMMAC : !con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)), + Clamp, Neg); + + // asm + + string IndexKeyAsm = !cond(!eq(IndexType, 0) : "", + !eq(IndexType, 8) : "$index_key_8bit", + !eq(IndexType, 16) : "$index_key_16bit"); + string ClampAsm = !if(IsIU, "$clamp", ""); + string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi", + !and(NegLoAny, !not(NegHiAny)) : "$neg_lo", + !and(!not(NegLoAny), !not(NegHiAny)) : ""); + + let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm; + + // isel patterns + + dag Src0InPat = !cond(IsAB_F16 
: (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), + IsAB_BF16 : (ins Src0VT:$src0), + IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + IsFP8BF8 : (ins Src0VT:$src0)); + dag Src0OutPat = !cond(IsAB_F16 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_BF16 : (ins (i32 8), Src0VT:$src0), + IsIU : (ins i32:$src0_modifiers, Src0VT:$src0), + IsFP8BF8 : (ins Src0VT:$src0)); + dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), + IsAB_BF16 : (ins Src1VT:$src1), + IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + IsFP8BF8 : (ins Src1VT:$src1)); + dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_BF16 : (ins (i32 8), Src1VT:$src1), + IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), + IsFP8BF8 : (ins Src1VT:$src1)); + dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_BF16 : (ins Src2VT:$src2), + IsIU : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_BF16 : (ins (i32 8), Src2VT:$src2), + IsIU : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins)); + dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), + !eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))), + !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit)))); + dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), + !eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit), + !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit)); + dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2))); + dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2)); + + + dag WmmaInPat = !con(Src0InPat, 
Src1InPat, Src2InPatWmma, ClampPat); + dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat); + + dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat); + dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat); + + // wmma pattern where src2 is inline imm uses _threeaddr pseudo, + // can't use _twoaddr since it would violate src2 tied to vdst constraint. + dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat); + dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat); +} + +multiclass WMMAInstGFX12 { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Constraints = "@earlyclobber $vdst,$vdst = $src2", isConvertibleToThreeAddress = 1 in + def _twoaddr : VOP3P_Pseudo{ + let PseudoInstr = Instr#PseudoInstrSuffix; + } + + let Constraints = "@earlyclobber $vdst", SchedRW = [Write32Bit, Write32Bit] in + def _threeaddr : VOP3P_Pseudo{ + let PseudoInstr = Instr#PseudoInstrSuffix; + } + + } + def : WMMAOpcodeMapping(NAME # _twoaddr), + !cast(NAME # _threeaddr)>; +} + +multiclass SWMMACInstGFX12 { + def _twoaddr : VOP3P_Pseudo{ + let Mnemonic = Instr; + let PseudoInstr = Instr#PseudoInstrSuffix; + let mayRaiseFPException = 0; + let ReadsModeReg = 0; + let AsmMatchConverter = "cvtSWMMAC"; + + let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; + } +} + +// First argument in Profile is types for matrices D, A, B and C (D = A * B + C) +// as used by llvm ir, types are vectors(with matrix elements) +// wave32: +// For 16x16 matrices, lanes 0 to 31 will have 8 matrix elts, +// for 16 x 32 16 elts and for 16 x 64 lanes have 32 elts. 
+// wave64: +// lanes will have half the size of elements in lanes compared to wave32 with +// exception of 16x16_iu4: lanes0-31 will have 8xi4, remaining lanes are ignored + +// general idea on element distribution differences: +// wave32: lane n has 8 matrix elements +// wave64: lane n has first 4, lane n+32 has other 4 elements + +// index size, for each 2 elements in lane you need 4bits in index + +// Non-standard types (iu8, iu4, fp8, bf8) will be packed in vectors of i32s. +// Original type for them is in comment on the right and refers to A and B. + +def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], 0, 0, 0, 0>; +def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], 0, 0, 0, 0>; +def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], 0, 0, 0, 0>; +def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], 0, 0, 0, 0>; +def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 8xi8 +def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], 0, 0, 1, 0>; // 8xi4 +def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], 0, 0, 0, 1>; // 8xf8 +def I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 16xi4 + +def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], 0, 0, 0, 0>; +def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], 0, 0, 0, 0>; +def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], 0, 0, 0, 0>; +def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], 0, 0, 0, 0>; +def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 4xi8 +def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 * +def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], 0, 0, 0, 1>; // 4xf8 +def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 + 
+def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], 1, 16, 0, 0>; +def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], 1, 16, 0, 0>; +def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], 1, 16, 0, 0>; +def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], 1, 16, 0, 0>; +def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 16, 1, 0>; // 8xi8, 16xi8 +def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], 1, 16, 1, 0>; // 8xi4, 16xi4 +def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 0, 1, 0>; // 16xi4, 32xi4 ** +def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], 1, 16, 0, 1>; // 8xf8, 16xf8 + +def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], 1, 8, 0, 0>; +def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], 1, 8, 0, 0>; +def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], 1, 8, 0, 0>; +def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], 1, 8, 0, 0>; +def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 8, 1, 0>; // 4xi8, 8xi8 +def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 1, 16, 1, 0>; // 8xi4, 8xi4 *** +def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 16, 1, 0>; // 8xi4, 16xi4 +def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, 8, 0, 1>; // 4xf8, 8xf8 + +// * IU4X16_WMMA_w64 lanes 0-31 will have 8xi4, remaining lanes are ignored +// ** IU4X64_SWMMAC_w32 index is i32, index_key is not used +// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored +// for matrix A, index is i16; Matrix B uses all lanes + +let WaveSizePredicate = isWave32 in { +defm V_WMMA_F32_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF16_w32 
: WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w32, "_w32">; +defm V_WMMA_BF16_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X16_IU8_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X16_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X32_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w32, "_w32">; + +defm V_SWMMAC_F32_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_BF16_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X32_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X32_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X64_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm 
V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">; +} + +let WaveSizePredicate = isWave64 in { +defm V_WMMA_F32_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w64, "_w64">; +defm V_WMMA_F16_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w64, "_w64">; +defm V_WMMA_BF16_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X16_IU8_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X16_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X32_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w64, "_w64">; + +defm V_SWMMAC_F32_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F16_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_BF16_16X16X32_BF16_w64 : 
SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w64, "_w64">; +defm V_SWMMAC_I32_16X16X32_IU8_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_I32_16X16X32_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w64, "_w64">; +defm V_SWMMAC_I32_16X16X64_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">; +} + +// IsGFX11OpselIntrinsic: f16_f16 and bf16_bf16 Intrinsics have imm operand that +// controls opsel. Used by gfx11, removed in gfx12 (operand must be 0). 
+multiclass WMMAPat { + def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)), + (P.DstVT !setdagop(P.WmmaOutPat, !cast(Inst#"_twoaddr")))>; + let AddedComplexity = 4 in + def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInlineInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)), + (P.DstVT !setdagop(P.WmmaInlineOutPat, !cast(Inst#"_threeaddr")))>; +} + +class SWMMACPat : + GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)), + (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>; + +class SWMMACPat_w64 : + GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)), + (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>{ + let WaveSizePredicate = isWave64; + } + +let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in { + defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>; + defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w32", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w32,1>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w32", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w32", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w32>; + defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w32", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w32>; + + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def 
: SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : GCNPat <(I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacInPat, int_amdgcn_swmmac_i32_16x16x64_iu4)), + (I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacOutPat, V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr))>; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; +} + +let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in { + defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>; + defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w64", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w64,1>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w64", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w64", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w64>; + defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w64", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w64>; + + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; +} + + //===----------------------------------------------------------------------===// // Begin Real Encodings 
//===----------------------------------------------------------------------===// @@ -1005,6 +1400,99 @@ multiclass VOP3P_Real_Base op, string backing_ps_name = NAME VOP3Pe_gfx11_gfx12(backing_ps_name).Pfl>; } +class VOP3PeWmma op, VOPProfile P, VOP3PWMMA_Profile WMMAP> + : VOP3Pe_gfx11_gfx12{ + // opsel + let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0, + !eq(WMMAP.IndexType, 8) : index_key_8bit{0}, + !eq(WMMAP.IndexType, 16) : index_key_16bit{0}); + let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0); + let Inst{13} = 0; + // opsel_hi + let Inst{59} = 1; + let Inst{60} = 1; + let Inst{14} = 1; + // neg_lo + let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0); + let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0); + let Inst{63} = !if(WMMAP.NegLo2, src2_modifiers{0}, 0); + // neg_hi + let Inst{8} = !if(WMMAP.NegHi01, src0_modifiers{1}, 0); + let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0); + let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0); + // clamp + let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0); +} + +multiclass VOP3P_WMMA_Real_Base op, VOP3PWMMA_Profile WMMAP, + string backing_ps_name = NAME, + string asmName = !cast(NAME).Mnemonic> { + def Gen.Suffix : + VOP3P_Real_Gen(backing_ps_name), Gen, asmName>, + VOP3PeWmma(backing_ps_name).Pfl, WMMAP>; +} + +multiclass VOP3P_Real_WMMA_gfx12 op, VOP3PWMMA_Profile WMMAP> { + let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in { + defm _twoaddr : VOP3P_WMMA_Real_Base ; + } +} + +multiclass VOP3P_Real_WMMA_gfx12w64 op, VOP3PWMMA_Profile WMMAP> { + let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX12" in { + defm _twoaddr : VOP3P_WMMA_Real_Base ; + } +} + +defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>; +defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>; +defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 
<0x043, BF16_BF16_WMMA_w32>; +defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x044, I32_IU8_WMMA_w32>; +defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x045, I32_IU4X16_WMMA_w32>; +defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x046, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x047, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x048, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x049, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x04a, I32_IU4X32_WMMA_w32>; + +defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x040, F32_F16_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x041, F32_BF16_WMMA_w64>; +defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x042, F16_F16_WMMA_w64>; +defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x043, BF16_BF16_WMMA_w64>; +defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x044, I32_IU8_WMMA_w64>; +defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>; +defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>; + + +defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x050, F32_F16_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x051, F32_BF16_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x052, F16_F16_SWMMAC_w32>; +defm V_SWMMAC_BF16_16X16X32_BF16_w32 : 
VOP3P_Real_WMMA_gfx12 <0x053, BF16_BF16_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x054, I32_IU8_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>; + +defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x050, F32_F16_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>; +defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x052, F16_F16_SWMMAC_w64>; +defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>; + multiclass VOP3P_Real_with_name op, string backing_ps_name = NAME, string asmName = !cast(NAME).Mnemonic> { diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 
df505c3365cbde..20d7c88fb7e59f 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -124,6 +124,7 @@ class VOP3_Pseudo pattern = [], let IsPacked = P.IsPacked; let IsMAI = P.IsMAI; let IsWMMA = P.IsWMMA; + let IsSWMMAC = P.IsSWMMAC; let AsmOperands = !if(isVop3OpSel, P.AsmVOP3OpSel, @@ -305,6 +306,11 @@ class VOP3OpSel_gfx10 op, VOPProfile p> : VOP3e_gfx10 { class VOP3OpSel_gfx11_gfx12 op, VOPProfile p> : VOP3OpSel_gfx10; +class VOP3FP8OpSel_gfx11_gfx12 op, VOPProfile p> : VOP3e_gfx10 { + let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0); + let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0); +} + class VOP3DotOpSel_gfx11_gfx12 op, VOPProfile p> : VOP3OpSel_gfx11_gfx12{ let Inst{11} = ?; let Inst{12} = ?; @@ -378,6 +384,8 @@ class VOP3Pe op, VOPProfile P> : Enc64 { bits<4> src2_modifiers; bits<9> src2; bits<1> clamp; + bits<2> index_key_8bit; + bits<1> index_key_16bit; let Inst{7-0} = vdst; let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 @@ -738,7 +746,7 @@ class VOP3_DPPe_Common_Base op, VOPProfile P> : Enc96 { let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs. 
let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?); - let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, 0),?); + let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?); let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?); let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?); let Inst{15} = !if(P.HasClamp, clamp, 0); @@ -1406,14 +1414,20 @@ multiclass VOP3_Real_with_name op, string opName, defvar ps = !cast(opName#"_e64"); let AsmString = asmName # ps.AsmOperands, IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { - if ps.Pfl.HasOpSel then - def _e64#Gen.Suffix : - VOP3_Real_Gen, - VOP3OpSel_gfx11_gfx12; - if !not(ps.Pfl.HasOpSel) then - def _e64#Gen.Suffix : - VOP3_Real_Gen, - VOP3e_gfx11_gfx12; + if ps.Pfl.IsFP8 then { + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3FP8OpSel_gfx11_gfx12; + } else { + if ps.Pfl.HasOpSel then + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3OpSel_gfx11_gfx12; + if !not(ps.Pfl.HasOpSel) then + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3e_gfx11_gfx12; + } } def Gen.Suffix#"_VOP3_alias" : MnemonicAlias, Requires<[Gen.AssemblerPredicate]>, LetDummies; } diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index eeb7f64aa5810e..9b54dd4e4e618d 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -2781,10 +2781,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, AFI->setLRIsSpilled(SavedRegs.test(ARM::LR)); } -void ARMFrameLowering::processFunctionBeforeFrameFinalized( - MachineFunction &MF, RegScavenger *RS) const { - TargetFrameLowering::processFunctionBeforeFrameFinalized(MF, RS); - +void ARMFrameLowering::updateLRRestored(MachineFunction &MF) { MachineFrameInfo &MFI = MF.getFrameInfo(); if (!MFI.isCalleeSavedInfoValid()) return; @@ -2808,6 +2805,12 @@ void 
ARMFrameLowering::processFunctionBeforeFrameFinalized( } } +void ARMFrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, RegScavenger *RS) const { + TargetFrameLowering::processFunctionBeforeFrameFinalized(MF, RS); + updateLRRestored(MF); +} + void ARMFrameLowering::getCalleeSaves(const MachineFunction &MF, BitVector &SavedRegs) const { TargetFrameLowering::getCalleeSaves(MF, SavedRegs); diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h index 8d2b8beb9a58fb..3c7358d8cd53e2 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -59,6 +59,10 @@ class ARMFrameLowering : public TargetFrameLowering { void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; + /// Update the IsRestored flag on LR if it is spilled, based on the return + /// instructions. + static void updateLRRestored(MachineFunction &MF); + void processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS = nullptr) const override; diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index ed9d30c3c3ab90..6121055eb02176 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2062,17 +2062,6 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { MO.setReg(ARM::PC); PrevMI.copyImplicitOps(*MBB.getParent(), *MBBI); MBB.erase(MBBI); - // We now restore LR into PC so it is not live-out of the return block - // anymore: Clear the CSI Restored bit. 
- MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo(); - // CSI should be fixed after PrologEpilog Insertion - assert(MFI.isCalleeSavedInfoValid() && "CSI should be valid"); - for (CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) { - if (Info.getReg() == ARM::LR) { - Info.setRestored(false); - break; - } - } return true; } } @@ -2120,14 +2109,22 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { isThumb2 = AFI->isThumb2Function(); isThumb1 = AFI->isThumbFunction() && !isThumb2; - bool Modified = false; + bool Modified = false, ModifiedLDMReturn = false; for (MachineBasicBlock &MBB : Fn) { Modified |= LoadStoreMultipleOpti(MBB); if (STI->hasV5TOps() && !AFI->shouldSignReturnAddress()) - Modified |= MergeReturnIntoLDM(MBB); + ModifiedLDMReturn |= MergeReturnIntoLDM(MBB); if (isThumb1) Modified |= CombineMovBx(MBB); } + Modified |= ModifiedLDMReturn; + + // If we merged a BX instruction into an LDM, we need to re-calculate whether + // LR is restored. This check needs to consider the whole function, not just + // the instruction(s) we changed, because there may be other BX returns which + // still need LR to be restored. + if (ModifiedLDMReturn) + ARMFrameLowering::updateLRRestored(Fn); Allocator.DestroyAll(); return Modified; diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 5c1c7046fdbff0..8629551152cb64 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -1806,12 +1806,13 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { PostOrderLoopTraversal DFS(LoLoop.ML, *MLI); DFS.ProcessLoop(); const SmallVectorImpl &PostOrder = DFS.getOrder(); - for (auto *MBB : PostOrder) { - recomputeLiveIns(*MBB); - // FIXME: For some reason, the live-in print order is non-deterministic for - // our tests and I can't out why... So just sort them. 
- MBB->sortUniqueLiveIns(); - } + bool anyChange = false; + do { + anyChange = false; + for (auto *MBB : PostOrder) { + anyChange = recomputeLiveIns(*MBB) || anyChange; + } + } while (anyChange); for (auto *MBB : reverse(PostOrder)) recomputeLivenessFlags(*MBB); diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td index efaaec32ee6bb1..0a77c7c1d418a1 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.td +++ b/llvm/lib/Target/AVR/AVRInstrInfo.td @@ -1398,7 +1398,7 @@ let mayLoad = 1, hasSideEffects = 0, // Load indirect with displacement operations. let canFoldAsLoad = 1, isReMaterializable = 1 in { - let Constraints = "@earlyclobber $reg" in def LDDRdPtrQ + def LDDRdPtrQ : FSTDLDD<0, (outs GPR8 : $reg), diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 76c1a14fe0156c..907aae13d6de0c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2343,7 +2343,9 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode( LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0), DAG.getConstant(CN1->getSExtValue() >> MaskIdx0, DL, ValTy), - DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT), + DAG.getConstant(ValBits == 32 ? (MaskIdx0 + (MaskLen0 & 31) - 1) + : (MaskIdx0 + MaskLen0 - 1), + DL, GRLenVT), DAG.getConstant(MaskIdx0, DL, GRLenVT)); } @@ -4940,3 +4942,8 @@ bool LoongArchTargetLowering::hasAndNotCompare(SDValue Y) const { return !isa(Y); } + +ISD::NodeType LoongArchTargetLowering::getExtendForAtomicCmpSwapArg() const { + // TODO: LAMCAS will use amcas{_DB,}.[bhwd] which does not require extension. 
+ return ISD::SIGN_EXTEND; +} diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 72182623b2c3dd..9e9ac0b8269291 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -206,6 +206,8 @@ class LoongArchTargetLowering : public TargetLowering { return ISD::SIGN_EXTEND; } + ISD::NodeType getExtendForAtomicCmpSwapArg() const override; + Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h index 7d39d47e86b363..fa9bc7608e7d2c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h @@ -45,6 +45,11 @@ class LoongArchTargetMachine : public LLVMTargetMachine { MachineFunctionInfo * createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override; + + // Addrspacecasts are always noops. + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + return true; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp index 04349aa52b5408..d47dded9ea6ecf 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp @@ -21,17 +21,20 @@ using namespace llvm; TypeSize LoongArchTTIImpl::getRegisterBitWidth( TargetTransformInfo::RegisterKind K) const { + TypeSize DefSize = TargetTransformInfoImplBase::getRegisterBitWidth(K); switch (K) { case TargetTransformInfo::RGK_Scalar: return TypeSize::getFixed(ST->is64Bit() ? 
64 : 32); case TargetTransformInfo::RGK_FixedWidthVector: - if (ST->hasExtLASX() && ST->hasExpAutoVec()) + if (!ST->hasExpAutoVec()) + return DefSize; + if (ST->hasExtLASX()) return TypeSize::getFixed(256); - if (ST->hasExtLSX() && ST->hasExpAutoVec()) + if (ST->hasExtLSX()) return TypeSize::getFixed(128); - return TypeSize::getFixed(0); + [[fallthrough]]; case TargetTransformInfo::RGK_ScalableVector: - return TypeSize::getScalable(0); + return DefSize; } llvm_unreachable("Unsupported register kind"); diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 3c673ae938fdec..9d6e8dc573a8d1 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -150,6 +150,7 @@ class MipsAsmParser : public MCTargetAsmParser { bool IsLittleEndian; bool IsPicEnabled; bool IsCpRestoreSet; + bool CurForbiddenSlotAttr; int CpRestoreOffset; unsigned GPReg; unsigned CpSaveLocation; @@ -552,6 +553,7 @@ class MipsAsmParser : public MCTargetAsmParser { CurrentFn = nullptr; + CurForbiddenSlotAttr = false; IsPicEnabled = getContext().getObjectFileInfo()->isPositionIndependent(); IsCpRestoreSet = false; @@ -723,6 +725,16 @@ class MipsAsmParser : public MCTargetAsmParser { return getSTI().hasFeature(Mips::FeatureGINV); } + bool hasForbiddenSlot(const MCInstrDesc &MCID) const { + return !inMicroMipsMode() && (MCID.TSFlags & MipsII::HasForbiddenSlot); + } + + bool SafeInForbiddenSlot(const MCInstrDesc &MCID) const { + return !(MCID.TSFlags & MipsII::IsCTI); + } + + void onEndOfFile() override; + /// Warn if RegIndex is the same as the current AT. void warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc); @@ -2307,7 +2319,41 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, bool FillDelaySlot = MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder(); - if (FillDelaySlot) + + // Get previous instruction`s forbidden slot attribute and + // whether set reorder. 
+ bool PrevForbiddenSlotAttr = CurForbiddenSlotAttr; + + // Flag represents we set reorder after nop. + bool SetReorderAfterNop = false; + + // If previous instruction has forbidden slot and .set reorder + // is active and current instruction is CTI. + // Then emit a NOP after it. + if (PrevForbiddenSlotAttr && !SafeInForbiddenSlot(MCID)) { + TOut.emitEmptyDelaySlot(false, IDLoc, STI); + // When 'FillDelaySlot' is true, the existing logic will add + // noreorder before instruction and reorder after it. So there + // need exclude this case avoiding two '.set reorder'. + // The format of the first case is: + // .set noreorder + // bnezc + // nop + // .set reorder + if (AssemblerOptions.back()->isReorder() && !FillDelaySlot) { + SetReorderAfterNop = true; + TOut.emitDirectiveSetReorder(); + } + } + + // Save current instruction`s forbidden slot and whether set reorder. + // This is the judgment condition for whether to add nop. + // We would add a couple of '.set noreorder' and '.set reorder' to + // wrap the current instruction and the next instruction. + CurForbiddenSlotAttr = + hasForbiddenSlot(MCID) && AssemblerOptions.back()->isReorder(); + + if (FillDelaySlot || CurForbiddenSlotAttr) TOut.emitDirectiveSetNoReorder(); MacroExpanderResultTy ExpandResult = @@ -2322,6 +2368,17 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, return true; } + // When current instruction was not CTI, recover reorder state. + // The format of the second case is: + // .set noreoder + // bnezc + // add + // .set reorder + if (PrevForbiddenSlotAttr && !SetReorderAfterNop && !FillDelaySlot && + AssemblerOptions.back()->isReorder()) { + TOut.emitDirectiveSetReorder(); + } + // We know we emitted an instruction on the MER_NotAMacro or MER_Success path. // If we're in microMIPS mode then we must also set EF_MIPS_MICROMIPS. 
if (inMicroMipsMode()) { @@ -2331,6 +2388,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, // If this instruction has a delay slot and .set reorder is active, // emit a NOP after it. + // The format of the third case is: + // .set noreorder + // bnezc + // nop + // .set noreorder + // j + // nop + // .set reorder if (FillDelaySlot) { TOut.emitEmptyDelaySlot(hasShortDelaySlot(Inst), IDLoc, STI); TOut.emitDirectiveSetReorder(); @@ -2356,6 +2421,17 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, return false; } +void MipsAsmParser::onEndOfFile() { + MipsTargetStreamer &TOut = getTargetStreamer(); + SMLoc IDLoc = SMLoc(); + // If has pending forbidden slot, fill nop and recover reorder. + if (CurForbiddenSlotAttr) { + TOut.emitEmptyDelaySlot(false, IDLoc, STI); + if (AssemblerOptions.back()->isReorder()) + TOut.emitDirectiveSetReorder(); + } +} + MipsAsmParser::MacroExpanderResultTy MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { @@ -2920,6 +2996,11 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, (Res.getSymA()->getSymbol().isELF() && cast(Res.getSymA()->getSymbol()).getBinding() == ELF::STB_LOCAL); + // For O32, "$"-prefixed symbols are recognized as temporary while + // .L-prefixed symbols are not (PrivateGlobalPrefix is "$"). Recognize ".L" + // manually. + if (ABI.IsO32() && Res.getSymA()->getSymbol().getName().starts_with(".L")) + IsLocalSym = true; bool UseXGOT = STI->hasFeature(Mips::FeatureXGOT) && !IsLocalSym; // The case where the result register is $25 is somewhat special. 
If the @@ -6359,7 +6440,7 @@ bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { return true; SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - MCSymbol *Sym = getContext().getOrCreateSymbol("$" + Identifier); + MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier); // Otherwise create a symbol reference. const MCExpr *SymRef = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 27d7f0f261d100..adfcea73615831 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -1255,7 +1255,9 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, emitRRI(Mips::SD, GPReg, Mips::SP, RegOrOffset, SMLoc(), &STI); } - if (getABI().IsN32()) { +#if 0 + // We haven't support -mabicalls -mno-shared yet. + if (-mno-shared) { MCSymbol *GPSym = MCA.getContext().getOrCreateSymbol("__gnu_local_gp"); const MipsMCExpr *HiExpr = MipsMCExpr::create( MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(GPSym, MCA.getContext()), @@ -1273,6 +1275,7 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, return; } +#endif const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff( MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()), @@ -1288,8 +1291,11 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, emitRRX(Mips::ADDiu, GPReg, GPReg, MCOperand::createExpr(LoExpr), SMLoc(), &STI); - // daddu $gp, $gp, $funcreg - emitRRR(Mips::DADDu, GPReg, GPReg, RegNo, SMLoc(), &STI); + // (d)addu $gp, $gp, $funcreg + if (getABI().IsN32()) + emitRRR(Mips::ADDu, GPReg, GPReg, RegNo, SMLoc(), &STI); + else + emitRRR(Mips::DADDu, GPReg, GPReg, RegNo, SMLoc(), &STI); } void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation, diff --git 
a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td index 854563ab32bd8e..3ef04e488f016f 100644 --- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td @@ -152,15 +152,15 @@ class SELNEZ_ENC : SPECIAL_3R_FM<0b00000, 0b110111>; class LWPC_ENC : PCREL19_FM; -class MAX_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>; -class MAX_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>; +class MAX_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>; +class MAX_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>; class MIN_S_ENC : COP1_3R_FM<0b011100, FIELD_FMT_S>; class MIN_D_ENC : COP1_3R_FM<0b011100, FIELD_FMT_D>; class MAXA_S_ENC : COP1_3R_FM<0b011111, FIELD_FMT_S>; class MAXA_D_ENC : COP1_3R_FM<0b011111, FIELD_FMT_D>; -class MINA_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>; -class MINA_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>; +class MINA_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>; +class MINA_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>; class SELEQZ_S_ENC : COP1_3R_FM<0b010100, FIELD_FMT_S>; class SELEQZ_D_ENC : COP1_3R_FM<0b010100, FIELD_FMT_D>; diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 718844bc36ff93..66b2b0de8d52a3 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -471,45 +471,6 @@ void MipsAsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) { TS.emitDirectiveInsn(); } -/// isBlockOnlyReachableByFallthough - Return true if the basic block has -/// exactly one predecessor and the control transfer mechanism between -/// the predecessor and this block is a fall-through. -bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock* - MBB) const { - // The predecessor has to be immediately before this block. - const MachineBasicBlock *Pred = *MBB->pred_begin(); - - // If the predecessor is a switch statement, assume a jump table - // implementation, so it is not a fall through. 
- if (const BasicBlock *bb = Pred->getBasicBlock()) - if (isa(bb->getTerminator())) - return false; - - // If this is a landing pad, it isn't a fall through. If it has no preds, - // then nothing falls through to it. - if (MBB->isEHPad() || MBB->pred_empty()) - return false; - - // If there isn't exactly one predecessor, it can't be a fall through. - if (MBB->pred_size() != 1) - return false; - - // The predecessor has to be immediately before this block. - if (!Pred->isLayoutSuccessor(MBB)) - return false; - - // If the block is completely empty, then it definitely does fall through. - if (Pred->empty()) - return true; - - // Otherwise, check the last instruction. - // Check if the last terminator is an unconditional branch. - MachineBasicBlock::const_iterator I = Pred->end(); - while (I != Pred->begin() && !(--I)->isTerminator()) ; - - return !I->isBarrier(); -} - // Print out an operand for an inline asm expression. bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, const char *ExtraCode, raw_ostream &O) { diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.h b/llvm/lib/Target/Mips/MipsAsmPrinter.h index 64424b181504a7..0b55089385d79d 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.h +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.h @@ -142,8 +142,6 @@ class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter { void emitFunctionBodyStart() override; void emitFunctionBodyEnd() override; void emitBasicBlockEnd(const MachineBasicBlock &MBB) override; - bool isBlockOnlyReachableByFallthrough( - const MachineBasicBlock* MBB) const override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index d431d3d91494f6..88b226eaaccfab 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ 
b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -4128,14 +4128,18 @@ MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 'd': // Address register. Same as 'r' unless generating MIPS16 code. case 'y': // Same as 'r'. Exists for compatibility. case 'r': - if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8 || VT == MVT::i1) { + if ((VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8 || + VT == MVT::i1) || + (VT == MVT::f32 && Subtarget.useSoftFloat())) { if (Subtarget.inMips16Mode()) return std::make_pair(0U, &Mips::CPU16RegsRegClass); return std::make_pair(0U, &Mips::GPR32RegClass); } - if (VT == MVT::i64 && !Subtarget.isGP64bit()) + if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat())) && + !Subtarget.isGP64bit()) return std::make_pair(0U, &Mips::GPR32RegClass); - if (VT == MVT::i64 && Subtarget.isGP64bit()) + if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat())) && + Subtarget.isGP64bit()) return std::make_pair(0U, &Mips::GPR64RegClass); // This will generate an error message return std::make_pair(0U, nullptr); diff --git a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp index aee57a5075ff71..b43eee8fdd8c0f 100644 --- a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp @@ -208,8 +208,10 @@ bool PPCExpandAtomicPseudo::expandAtomicRMW128( .addMBB(LoopMBB); CurrentMBB->addSuccessor(LoopMBB); CurrentMBB->addSuccessor(ExitMBB); - recomputeLiveIns(*LoopMBB); - recomputeLiveIns(*ExitMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB); + } while (anyChange); NMBBI = MBB.end(); MI.eraseFromParent(); return true; @@ -286,9 +288,11 @@ bool PPCExpandAtomicPseudo::expandAtomicCmpSwap128( CurrentMBB->addSuccessor(LoopCmpMBB); CurrentMBB->addSuccessor(ExitMBB); - recomputeLiveIns(*LoopCmpMBB); - recomputeLiveIns(*CmpSuccMBB); - 
recomputeLiveIns(*ExitMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*CmpSuccMBB) || + recomputeLiveIns(*LoopCmpMBB); + } while (anyChange); NMBBI = MBB.end(); MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 245e78641ed654..424501c35c043c 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1191,12 +1191,6 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC) continue; - // For SVR4, don't emit a move for the CR spill slot if we haven't - // spilled CRs. - if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4) - && !MustSaveCR) - continue; - // For 64-bit SVR4 when we have spilled CRs, the spill location // is SP+8, not a frame-relative slot. if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) { @@ -1441,8 +1435,11 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF, ProbeLoopBodyMBB->addSuccessor(ProbeLoopBodyMBB); } // Update liveins. - recomputeLiveIns(*ProbeLoopBodyMBB); - recomputeLiveIns(*ProbeExitMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*ProbeExitMBB) || + recomputeLiveIns(*ProbeLoopBodyMBB); + } while (anyChange); return ProbeExitMBB; }; // For case HasBP && MaxAlign > 1, we have to realign the SP by performing @@ -1534,8 +1531,10 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF, buildDefCFAReg(*ExitMBB, ExitMBB->begin(), SPReg); } // Update liveins. 
- recomputeLiveIns(*LoopMBB); - recomputeLiveIns(*ExitMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB); + } while (anyChange); } } ++NumPrologProbed; diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 26ed74108ec36c..18a4223d481ef0 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -1635,7 +1635,8 @@ class BitPermutationSelector { default: break; case ISD::ROTL: if (isa(V.getOperand(1))) { - unsigned RotAmt = V.getConstantOperandVal(1); + assert(isPowerOf2_32(NumBits) && "rotl bits should be power of 2!"); + unsigned RotAmt = V.getConstantOperandVal(1) & (NumBits - 1); const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second; @@ -1648,15 +1649,20 @@ class BitPermutationSelector { case ISD::SHL: case PPCISD::SHL: if (isa(V.getOperand(1))) { - unsigned ShiftAmt = V.getConstantOperandVal(1); + // sld takes 7 bits, slw takes 6. + unsigned ShiftAmt = V.getConstantOperandVal(1) & ((NumBits << 1) - 1); const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second; - for (unsigned i = ShiftAmt; i < NumBits; ++i) - Bits[i] = LHSBits[i - ShiftAmt]; - - for (unsigned i = 0; i < ShiftAmt; ++i) - Bits[i] = ValueBit(ValueBit::ConstZero); + if (ShiftAmt >= NumBits) { + for (unsigned i = 0; i < NumBits; ++i) + Bits[i] = ValueBit(ValueBit::ConstZero); + } else { + for (unsigned i = ShiftAmt; i < NumBits; ++i) + Bits[i] = LHSBits[i - ShiftAmt]; + for (unsigned i = 0; i < ShiftAmt; ++i) + Bits[i] = ValueBit(ValueBit::ConstZero); + } return std::make_pair(Interesting = true, &Bits); } @@ -1664,15 +1670,20 @@ class BitPermutationSelector { case ISD::SRL: case PPCISD::SRL: if (isa(V.getOperand(1))) { - unsigned ShiftAmt = V.getConstantOperandVal(1); + // srd takes lowest 7 bits, srw takes 6. 
+ unsigned ShiftAmt = V.getConstantOperandVal(1) & ((NumBits << 1) - 1); const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second; - for (unsigned i = 0; i < NumBits - ShiftAmt; ++i) - Bits[i] = LHSBits[i + ShiftAmt]; - - for (unsigned i = NumBits - ShiftAmt; i < NumBits; ++i) - Bits[i] = ValueBit(ValueBit::ConstZero); + if (ShiftAmt >= NumBits) { + for (unsigned i = 0; i < NumBits; ++i) + Bits[i] = ValueBit(ValueBit::ConstZero); + } else { + for (unsigned i = 0; i < NumBits - ShiftAmt; ++i) + Bits[i] = LHSBits[i + ShiftAmt]; + for (unsigned i = NumBits - ShiftAmt; i < NumBits; ++i) + Bits[i] = ValueBit(ValueBit::ConstZero); + } return std::make_pair(Interesting = true, &Bits); } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index aec58d1c0dcb9f..85f1e670045b92 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -14942,6 +14942,7 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i8, LDN->getMemOperand()); + DAG.makeEquivalentMemoryOrdering(LDN, Ld); // For signed conversion, we need to sign-extend the value in the VSR if (Signed) { diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index a0c3345ec1bbd7..ac88cd49db4e4b 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -5,6 +5,7 @@ set(LLVM_TARGET_DEFINITIONS RISCV.td) tablegen(LLVM RISCVGenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM RISCVGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM RISCVGenCompressInstEmitter.inc -gen-compress-inst-emitter) +tablegen(LLVM RISCVGenMacroFusion.inc -gen-macro-fusion-pred) tablegen(LLVM RISCVGenDAGISel.inc -gen-dag-isel) tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM RISCVGenInstrInfo.inc -gen-instr-info) @@ -43,7 +44,6 @@ 
add_llvm_target(RISCVCodeGen RISCVISelDAGToDAG.cpp RISCVISelLowering.cpp RISCVMachineFunctionInfo.cpp - RISCVMacroFusion.cpp RISCVMergeBaseOffset.cpp RISCVOptWInstrs.cpp RISCVPostRAExpandPseudoInsts.cpp diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index e6e879282241dd..27d52c16a4f39d 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -30,6 +30,12 @@ include "RISCVCallingConv.td" include "RISCVInstrInfo.td" include "GISel/RISCVRegisterBanks.td" +//===----------------------------------------------------------------------===// +// RISC-V macro fusions. +//===----------------------------------------------------------------------===// + +include "RISCVMacroFusion.td" + //===----------------------------------------------------------------------===// // RISC-V Scheduling Models //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 3878be680c0492..26451c80f57b42 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -72,7 +72,7 @@ def FeatureStdExtZicntr [FeatureStdExtZicsr]>; def FeatureStdExtZicond - : SubtargetFeature<"experimental-zicond", "HasStdExtZicond", "true", + : SubtargetFeature<"zicond", "HasStdExtZicond", "true", "'Zicond' (Integer Conditional Operations)">; def HasStdExtZicond : Predicate<"Subtarget->hasStdExtZicond()">, AssemblerPredicate<(all_of FeatureStdExtZicond), @@ -1044,30 +1044,6 @@ def TuneDLenFactor2 : SubtargetFeature<"dlen-factor-2", "DLenFactor2", "true", "Vector unit DLEN(data path width) is half of VLEN">; -def TuneLUIADDIFusion - : SubtargetFeature<"lui-addi-fusion", "HasLUIADDIFusion", - "true", "Enable LUI+ADDI macrofusion">; - -def TuneAUIPCADDIFusion - : SubtargetFeature<"auipc-addi-fusion", "HasAUIPCADDIFusion", - "true", "Enable AUIPC+ADDI macrofusion">; - -def TuneZExtHFusion - : 
SubtargetFeature<"zexth-fusion", "HasZExtHFusion", - "true", "Enable SLLI+SRLI to be fused to zero extension of halfword">; - -def TuneZExtWFusion - : SubtargetFeature<"zextw-fusion", "HasZExtWFusion", - "true", "Enable SLLI+SRLI to be fused to zero extension of word">; - -def TuneShiftedZExtWFusion - : SubtargetFeature<"shifted-zextw-fusion", "HasShiftedZExtWFusion", - "true", "Enable SLLI+SRLI to be fused when computing (shifted) zero extension of word">; - -def TuneLDADDFusion - : SubtargetFeature<"ld-add-fusion", "HasLDADDFusion", - "true", "Enable LD+ADD macrofusion.">; - def TuneNoDefaultUnroll : SubtargetFeature<"no-default-unroll", "EnableDefaultUnroll", "false", "Disable default unroll preference.">; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 47c6cd6e5487b8..d46093b9e260a2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3192,7 +3192,8 @@ static std::optional getExactInteger(const APFloat &APF, // Note that this method will also match potentially unappealing index // sequences, like , however it is left to the caller to // determine whether this is worth generating code for. -static std::optional isSimpleVIDSequence(SDValue Op) { +static std::optional isSimpleVIDSequence(SDValue Op, + unsigned EltSizeInBits) { unsigned NumElts = Op.getNumOperands(); assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unexpected BUILD_VECTOR"); bool IsInteger = Op.getValueType().isInteger(); @@ -3200,7 +3201,7 @@ static std::optional isSimpleVIDSequence(SDValue Op) { std::optional SeqStepDenom; std::optional SeqStepNum, SeqAddend; std::optional> PrevElt; - unsigned EltSizeInBits = Op.getValueType().getScalarSizeInBits(); + assert(EltSizeInBits >= Op.getValueType().getScalarSizeInBits()); for (unsigned Idx = 0; Idx < NumElts; Idx++) { // Assume undef elements match the sequence; we just have to be careful // when interpolating across them. 
@@ -3213,14 +3214,14 @@ static std::optional isSimpleVIDSequence(SDValue Op) { if (!isa(Op.getOperand(Idx))) return std::nullopt; Val = Op.getConstantOperandVal(Idx) & - maskTrailingOnes(EltSizeInBits); + maskTrailingOnes(Op.getScalarValueSizeInBits()); } else { // The BUILD_VECTOR must be all constants. if (!isa(Op.getOperand(Idx))) return std::nullopt; if (auto ExactInteger = getExactInteger( cast(Op.getOperand(Idx))->getValueAPF(), - EltSizeInBits)) + Op.getScalarValueSizeInBits())) Val = *ExactInteger; else return std::nullopt; @@ -3276,11 +3277,11 @@ static std::optional isSimpleVIDSequence(SDValue Op) { uint64_t Val; if (IsInteger) { Val = Op.getConstantOperandVal(Idx) & - maskTrailingOnes(EltSizeInBits); + maskTrailingOnes(Op.getScalarValueSizeInBits()); } else { Val = *getExactInteger( cast(Op.getOperand(Idx))->getValueAPF(), - EltSizeInBits); + Op.getScalarValueSizeInBits()); } uint64_t ExpectedVal = (int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom; @@ -3550,7 +3551,7 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG, // Try and match index sequences, which we can lower to the vid instruction // with optional modifications. An all-undef vector is matched by // getSplatValue, above. - if (auto SimpleVID = isSimpleVIDSequence(Op)) { + if (auto SimpleVID = isSimpleVIDSequence(Op, Op.getScalarValueSizeInBits())) { int64_t StepNumerator = SimpleVID->StepNumerator; unsigned StepDenominator = SimpleVID->StepDenominator; int64_t Addend = SimpleVID->Addend; @@ -4718,7 +4719,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN, if (SrcVecIdx == -1) continue; unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts; - SDValue SrcVec = (unsigned)SrcVecIdx > VRegsPerSrc ? V2 : V1; + SDValue SrcVec = (unsigned)SrcVecIdx >= VRegsPerSrc ? 
V2 : V1; SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec, DAG.getVectorIdxConstant(ExtractIdx, DL)); SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget); @@ -5033,60 +5034,56 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, MVT IndexContainerVT = ContainerVT.changeVectorElementType(IndexVT.getScalarType()); - // Base case for the recursion just below - handle the worst case - // single source permutation. Note that all the splat variants - // are handled above. - if (V2.isUndef()) { + SDValue Gather; + // TODO: This doesn't trigger for i64 vectors on RV32, since there we + // encounter a bitcasted BUILD_VECTOR with low/high i32 values. + if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) { + Gather = lowerScalarSplat(SDValue(), SplatValue, VL, ContainerVT, DL, DAG, + Subtarget); + } else { V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget); - SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS); - LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG, - Subtarget); - SDValue Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices, - DAG.getUNDEF(ContainerVT), TrueMask, VL); - return convertFromScalableVector(VT, Gather, DAG, Subtarget); - } - - // Translate the gather index we computed above (and possibly swapped) - // back to a shuffle mask. This step should disappear once we complete - // the migration to recursive design. - SmallVector ShuffleMaskLHS; - ShuffleMaskLHS.reserve(GatherIndicesLHS.size()); - for (SDValue GatherIndex : GatherIndicesLHS) { - if (GatherIndex.isUndef()) { - ShuffleMaskLHS.push_back(-1); - continue; + // If only one index is used, we can use a "splat" vrgather. + // TODO: We can splat the most-common index and fix-up any stragglers, if + // that's beneficial. 
+ if (LHSIndexCounts.size() == 1) { + int SplatIndex = LHSIndexCounts.begin()->getFirst(); + Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V1, + DAG.getConstant(SplatIndex, DL, XLenVT), + DAG.getUNDEF(ContainerVT), TrueMask, VL); + } else { + SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS); + LHSIndices = + convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget); + + Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices, + DAG.getUNDEF(ContainerVT), TrueMask, VL); } - auto *IdxC = cast(GatherIndex); - ShuffleMaskLHS.push_back(IdxC->getZExtValue()); } - // Recursively invoke lowering for the LHS as if there were no RHS. - // This allows us to leverage all of our single source permute tricks. - SDValue Gather = - DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS); - Gather = convertToScalableVector(ContainerVT, Gather, DAG, Subtarget); + // If a second vector operand is used by this shuffle, blend it in with an + // additional vrgather. + if (!V2.isUndef()) { + V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget); - // Blend in second vector source with an additional vrgather. - V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget); + MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1); + SelectMask = + convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget); - MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1); - SelectMask = - convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget); - - // If only one index is used, we can use a "splat" vrgather. - // TODO: We can splat the most-common index and fix-up any stragglers, if - // that's beneficial. 
- if (RHSIndexCounts.size() == 1) { - int SplatIndex = RHSIndexCounts.begin()->getFirst(); - Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2, - DAG.getConstant(SplatIndex, DL, XLenVT), Gather, - SelectMask, VL); - } else { - SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS); - RHSIndices = - convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget); - Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather, - SelectMask, VL); + // If only one index is used, we can use a "splat" vrgather. + // TODO: We can splat the most-common index and fix-up any stragglers, if + // that's beneficial. + if (RHSIndexCounts.size() == 1) { + int SplatIndex = RHSIndexCounts.begin()->getFirst(); + Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2, + DAG.getConstant(SplatIndex, DL, XLenVT), Gather, + SelectMask, VL); + } else { + SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS); + RHSIndices = + convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget); + Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather, + SelectMask, VL); + } } return convertFromScalableVector(VT, Gather, DAG, Subtarget); @@ -14562,7 +14559,7 @@ static SDValue tryFoldSelectIntoOp(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); SDLoc DL(N); SDValue OtherOp = TrueVal.getOperand(1 - OpToFold); - EVT OtherOpVT = OtherOp->getValueType(0); + EVT OtherOpVT = OtherOp.getValueType(); SDValue IdentityOperand = DAG.getNeutralElement(Opc, DL, OtherOpVT, N->getFlags()); if (!Commutative) @@ -14658,8 +14655,8 @@ static SDValue useInversedSetcc(SDNode *N, SelectionDAG &DAG, ISD::CondCode CC = cast(Cond.getOperand(2))->get(); if (CC == ISD::SETEQ && LHS.getOpcode() == ISD::AND && isa(LHS.getOperand(1)) && isNullConstant(RHS)) { - uint64_t MaskVal = LHS.getConstantOperandVal(1); - if (isPowerOf2_64(MaskVal) && !isInt<12>(MaskVal)) + const APInt &MaskVal = LHS.getConstantOperandAPInt(1); + if 
(MaskVal.isPowerOf2() && !MaskVal.isSignedIntN(12)) return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CondVT, LHS, RHS, ISD::SETNE), False, True); @@ -15565,8 +15562,11 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, MGN->getMemOperand(), IndexType, MGN->getExtensionType()); if (Index.getOpcode() == ISD::BUILD_VECTOR && - MGN->getExtensionType() == ISD::NON_EXTLOAD) { - if (std::optional SimpleVID = isSimpleVIDSequence(Index); + MGN->getExtensionType() == ISD::NON_EXTLOAD && isTypeLegal(VT)) { + // The sequence will be XLenVT, not the type of Index. Tell + // isSimpleVIDSequence this so we avoid overflow. + if (std::optional SimpleVID = + isSimpleVIDSequence(Index, Subtarget.getXLen()); SimpleVID && SimpleVID->StepDenominator == 1) { const int64_t StepNumerator = SimpleVID->StepNumerator; const int64_t Addend = SimpleVID->Addend; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 592962cebe8973..d5b1ddfbeb3dc9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1229,7 +1229,8 @@ bool RISCVInstrInfo::optimizeCondBranch(MachineInstr &MI) const { MachineBasicBlock::reverse_iterator II(&MI), E = MBB->rend(); auto DefC1 = std::find_if(++II, E, [&](const MachineInstr &I) -> bool { int64_t Imm; - return isLoadImm(&I, Imm) && Imm == C1; + return isLoadImm(&I, Imm) && Imm == C1 && + I.getOperand(0).getReg().isVirtual(); }); if (DefC1 != E) return DefC1->getOperand(0).getReg(); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZicond.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZicond.td index 0790a941823b1a..35d3fdae0bd79b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZicond.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZicond.td @@ -8,8 +8,6 @@ // // This file describes the RISC-V instructions from the standard Integer // Conditional operations extension (Zicond). -// This version is still experimental as the 'Zicond' extension hasn't been -// ratified yet. 
It is based on v1.0-rc1 of the specification. // //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp b/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp deleted file mode 100644 index f948f05b22f772..00000000000000 --- a/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp +++ /dev/null @@ -1,210 +0,0 @@ -//===- RISCVMacroFusion.cpp - RISC-V Macro Fusion -------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file This file contains the RISC-V implementation of the DAG scheduling -/// mutation to pair instructions back to back. -// -//===----------------------------------------------------------------------===// -// -#include "RISCVMacroFusion.h" -#include "RISCVSubtarget.h" -#include "llvm/CodeGen/MacroFusion.h" -#include "llvm/CodeGen/TargetInstrInfo.h" - -using namespace llvm; - -static bool checkRegisters(Register FirstDest, const MachineInstr &SecondMI) { - if (!SecondMI.getOperand(1).isReg()) - return false; - - if (SecondMI.getOperand(1).getReg() != FirstDest) - return false; - - // If the input is virtual make sure this is the only user. 
- if (FirstDest.isVirtual()) { - auto &MRI = SecondMI.getMF()->getRegInfo(); - return MRI.hasOneNonDBGUse(FirstDest); - } - - return SecondMI.getOperand(0).getReg() == FirstDest; -} - -// Fuse load with add: -// add rd, rs1, rs2 -// ld rd, 0(rd) -static bool isLDADD(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::LD) - return false; - - if (!SecondMI.getOperand(2).isImm()) - return false; - - if (SecondMI.getOperand(2).getImm() != 0) - return false; - - // Given SecondMI, when FirstMI is unspecified, we must return - // if SecondMI may be part of a fused pair at all. - if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::ADD) - return true; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse zero extension of halfword: -// slli rd, rs1, 48 -// srli rd, rd, 48 -static bool isZExtH(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::SRLI) - return false; - - if (!SecondMI.getOperand(2).isImm()) - return false; - - if (SecondMI.getOperand(2).getImm() != 48) - return false; - - // Given SecondMI, when FirstMI is unspecified, we must return - // if SecondMI may be part of a fused pair at all. - if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::SLLI) - return false; - - if (FirstMI->getOperand(2).getImm() != 48) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse zero extension of word: -// slli rd, rs1, 32 -// srli rd, rd, 32 -static bool isZExtW(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::SRLI) - return false; - - if (!SecondMI.getOperand(2).isImm()) - return false; - - if (SecondMI.getOperand(2).getImm() != 32) - return false; - - // Given SecondMI, when FirstMI is unspecified, we must return - // if SecondMI may be part of a fused pair at all. 
- if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::SLLI) - return false; - - if (FirstMI->getOperand(2).getImm() != 32) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse shifted zero extension of word: -// slli rd, rs1, 32 -// srli rd, rd, x -// where 0 <= x < 32 -static bool isShiftedZExtW(const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::SRLI) - return false; - - if (!SecondMI.getOperand(2).isImm()) - return false; - - unsigned SRLIImm = SecondMI.getOperand(2).getImm(); - if (SRLIImm >= 32) - return false; - - // Given SecondMI, when FirstMI is unspecified, we must return - // if SecondMI may be part of a fused pair at all. - if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::SLLI) - return false; - - if (FirstMI->getOperand(2).getImm() != 32) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse AUIPC followed by ADDI -// auipc rd, imm20 -// addi rd, rd, imm12 -static bool isAUIPCADDI(const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::ADDI) - return false; - // Assume the 1st instr to be a wildcard if it is unspecified. - if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::AUIPC) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -// Fuse LUI followed by ADDI or ADDIW. -// rd = imm[31:0] which decomposes to -// lui rd, imm[31:12] -// addi(w) rd, rd, imm[11:0] -static bool isLUIADDI(const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - if (SecondMI.getOpcode() != RISCV::ADDI && - SecondMI.getOpcode() != RISCV::ADDIW) - return false; - // Assume the 1st instr to be a wildcard if it is unspecified. 
- if (!FirstMI) - return true; - - if (FirstMI->getOpcode() != RISCV::LUI) - return false; - - return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); -} - -static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, - const TargetSubtargetInfo &TSI, - const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - const RISCVSubtarget &ST = static_cast(TSI); - - if (ST.hasLUIADDIFusion() && isLUIADDI(FirstMI, SecondMI)) - return true; - - if (ST.hasAUIPCADDIFusion() && isAUIPCADDI(FirstMI, SecondMI)) - return true; - - if (ST.hasZExtHFusion() && isZExtH(FirstMI, SecondMI)) - return true; - - if (ST.hasZExtWFusion() && isZExtW(FirstMI, SecondMI)) - return true; - - if (ST.hasShiftedZExtWFusion() && isShiftedZExtW(FirstMI, SecondMI)) - return true; - - if (ST.hasLDADDFusion() && isLDADD(FirstMI, SecondMI)) - return true; - - return false; -} - -std::unique_ptr llvm::createRISCVMacroFusionDAGMutation() { - return createMacroFusionDAGMutation(shouldScheduleAdjacent); -} diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.h b/llvm/lib/Target/RISCV/RISCVMacroFusion.h deleted file mode 100644 index 7598db3f8fe143..00000000000000 --- a/llvm/lib/Target/RISCV/RISCVMacroFusion.h +++ /dev/null @@ -1,28 +0,0 @@ -//===- RISCVMacroFusion.h - RISC-V Macro Fusion -----------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file This file contains the RISC-V definition of the DAG scheduling -/// mutation to pair instructions back to back. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_RISCV_RISCVMACROFUSION_H -#define LLVM_LIB_TARGET_RISCV_RISCVMACROFUSION_H - -#include "llvm/CodeGen/MachineScheduler.h" - -namespace llvm { - -/// Note that you have to add: -/// DAG.addMutation(createRISCVMacroFusionDAGMutation()); -/// to RISCVPassConfig::createMachineScheduler() to have an effect. -std::unique_ptr createRISCVMacroFusionDAGMutation(); - -} // namespace llvm - -#endif diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.td b/llvm/lib/Target/RISCV/RISCVMacroFusion.td new file mode 100644 index 00000000000000..875a93d09a2c64 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVMacroFusion.td @@ -0,0 +1,93 @@ +//==----- RISCVMacroFusion.td - Macro Fusion Definitions -----*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// ===---------------------------------------------------------------------===// +// The following definitions describe the macro fusion predicators. 
+ +// Fuse LUI followed by ADDI or ADDIW: +// rd = imm[31:0] which decomposes to +// lui rd, imm[31:12] +// addi(w) rd, rd, imm[11:0] +def TuneLUIADDIFusion + : SimpleFusion<"lui-addi-fusion", "HasLUIADDIFusion", + "Enable LUI+ADDI macro fusion", + CheckOpcode<[LUI]>, + CheckOpcode<[ADDI, ADDIW]>>; + +// Fuse AUIPC followed by ADDI: +// auipc rd, imm20 +// addi rd, rd, imm12 +def TuneAUIPCADDIFusion + : SimpleFusion<"auipc-addi-fusion", "HasAUIPCADDIFusion", + "Enable AUIPC+ADDI macrofusion", + CheckOpcode<[AUIPC]>, + CheckOpcode<[ADDI]>>; + +// Fuse zero extension of halfword: +// slli rd, rs1, 48 +// srli rd, rd, 48 +def TuneZExtHFusion + : SimpleFusion<"zexth-fusion", "HasZExtHFusion", + "Enable SLLI+SRLI to be fused to zero extension of halfword", + CheckAll<[ + CheckOpcode<[SLLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 48> + ]>, + CheckAll<[ + CheckOpcode<[SRLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 48> + ]>>; + +// Fuse zero extension of word: +// slli rd, rs1, 32 +// srli rd, rd, 32 +def TuneZExtWFusion + : SimpleFusion<"zextw-fusion", "HasZExtWFusion", + "Enable SLLI+SRLI to be fused to zero extension of word", + CheckAll<[ + CheckOpcode<[SLLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 32> + ]>, + CheckAll<[ + CheckOpcode<[SRLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 32> + ]>>; + +// Fuse shifted zero extension of word: +// slli rd, rs1, 32 +// srli rd, rd, x +// where 0 <= x < 32 +def TuneShiftedZExtWFusion + : SimpleFusion<"shifted-zextw-fusion", "HasShiftedZExtWFusion", + "Enable SLLI+SRLI to be fused when computing (shifted) word zero extension", + CheckAll<[ + CheckOpcode<[SLLI]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 32> + ]>, + CheckAll<[ + CheckOpcode<[SRLI]>, + CheckIsImmOperand<2>, + CheckImmOperandRange<2, 0, 31> + ]>>; + +// Fuse load with add: +// add rd, rs1, rs2 +// ld rd, 0(rd) +def TuneLDADDFusion + : SimpleFusion<"ld-add-fusion", "HasLDADDFusion", "Enable LD+ADD macrofusion", + CheckOpcode<[ADD]>, + 
CheckAll<[ + CheckOpcode<[LD]>, + CheckIsImmOperand<2>, + CheckImmOperand<2, 0> + ]>>; diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 7b64d3cee9c800..d3236bb07d56d5 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -16,8 +16,9 @@ #include "GISel/RISCVRegisterBankInfo.h" #include "RISCV.h" #include "RISCVFrameLowering.h" -#include "RISCVMacroFusion.h" #include "RISCVTargetMachine.h" +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/ErrorHandling.h" @@ -29,6 +30,9 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "RISCVGenSubtargetInfo.inc" +#define GET_RISCV_MACRO_FUSION_PRED_IMPL +#include "RISCVGenMacroFusion.inc" + namespace llvm::RISCVTuneInfoTable { #define GET_RISCVTuneInfoTable_IMPL @@ -187,7 +191,7 @@ bool RISCVSubtarget::enableSubRegLiveness() const { void RISCVSubtarget::getPostRAMutations( std::vector> &Mutations) const { - Mutations.push_back(createRISCVMacroFusionDAGMutation()); + Mutations.push_back(createMacroFusionDAGMutation(getMacroFusions())); } /// Enable use of alias analysis during code generation (during MI diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 2ba93764facd07..8c55efa69a6a5f 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -27,6 +27,9 @@ #include "llvm/Target/TargetMachine.h" #include +#define GET_RISCV_MACRO_FUSION_PRED_DECL +#include "RISCVGenMacroFusion.inc" + #define GET_SUBTARGETINFO_HEADER #include "RISCVGenSubtargetInfo.inc" @@ -196,11 +199,6 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { return UserReservedRegister[i]; } - bool hasMacroFusion() const { - return hasLUIADDIFusion() || hasAUIPCADDIFusion() || hasZExtHFusion() || - hasZExtWFusion() || hasShiftedZExtWFusion() || hasLDADDFusion(); - } - // 
Vector codegen related methods. bool hasVInstructions() const { return HasStdExtZve32x; } bool hasVInstructionsI64() const { return HasStdExtZve64x; } diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index b4b81b545a54bb..2285c99d790100 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -14,7 +14,6 @@ #include "MCTargetDesc/RISCVBaseInfo.h" #include "RISCV.h" #include "RISCVMachineFunctionInfo.h" -#include "RISCVMacroFusion.h" #include "RISCVTargetObjectFile.h" #include "RISCVTargetTransformInfo.h" #include "TargetInfo/RISCVTargetInfo.h" @@ -26,6 +25,8 @@ #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/MIRYamlMapping.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/MacroFusion.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" @@ -361,9 +362,10 @@ class RISCVPassConfig : public TargetPassConfig { DAG->addMutation(createLoadClusterDAGMutation( DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true)); } - if (ST.hasMacroFusion()) { + const auto &MacroFusions = ST.getMacroFusions(); + if (!MacroFusions.empty()) { DAG = DAG ? 
DAG : createGenericSchedLive(C); - DAG->addMutation(createRISCVMacroFusionDAGMutation()); + DAG->addMutation(createMacroFusionDAGMutation(MacroFusions)); } return DAG; } @@ -371,9 +373,10 @@ class RISCVPassConfig : public TargetPassConfig { ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const override { const RISCVSubtarget &ST = C->MF->getSubtarget(); - if (ST.hasMacroFusion()) { + const auto &MacroFusions = ST.getMacroFusions(); + if (!MacroFusions.empty()) { ScheduleDAGMI *DAG = createGenericSchedPostRA(C); - DAG->addMutation(createRISCVMacroFusionDAGMutation()); + DAG->addMutation(createMacroFusionDAGMutation(MacroFusions)); return DAG; } return nullptr; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 866d5cf340e68b..66dab70d455ff4 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -37,6 +37,9 @@ static cl::opt SLPMaxVF( InstructionCost RISCVTTIImpl::getRISCVInstructionCost(ArrayRef OpCodes, MVT VT, TTI::TargetCostKind CostKind) { + // Check if the type is valid for all CostKind + if (!VT.isVector()) + return InstructionCost::getInvalid(); size_t NumInstr = OpCodes.size(); if (CostKind == TTI::TCK_CodeSize) return NumInstr; diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td index 7b103395652433..38a59e650f33c7 100644 --- a/llvm/lib/Target/Sparc/Sparc.td +++ b/llvm/lib/Target/Sparc/Sparc.td @@ -72,6 +72,20 @@ def TuneSlowRDPC : SubtargetFeature<"slow-rdpc", "HasSlowRDPC", "true", //==== Features added predmoninantly for LEON subtarget support include "LeonFeatures.td" +//==== Register allocation tweaks needed by some low-level software +foreach i = 1 ... 7 in + def FeatureReserveG#i : SubtargetFeature<"reserve-g"#i, "ReserveRegister["#i#" + SP::G0]", "true", + "Reserve G"#i#", making it unavailable as a GPR">; +foreach i = 0 ... 
5 in + def FeatureReserveO#i : SubtargetFeature<"reserve-o"#i, "ReserveRegister["#i#" + SP::O0]", "true", + "Reserve O"#i#", making it unavailable as a GPR">; +foreach i = 0 ... 7 in + def FeatureReserveL#i : SubtargetFeature<"reserve-l"#i, "ReserveRegister["#i#" + SP::L0]", "true", + "Reserve L"#i#", making it unavailable as a GPR">; +foreach i = 0 ... 5 in + def FeatureReserveI#i : SubtargetFeature<"reserve-i"#i, "ReserveRegister["#i#" + SP::I0]", "true", + "Reserve I"#i#", making it unavailable as a GPR">; + //===----------------------------------------------------------------------===// // Register File, Calling Conv, Instruction Descriptions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp index 215a8ea8319046..6855471840e9db 100644 --- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -434,6 +434,50 @@ bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, default: // See if this is a generic print operand return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O); + case 'L': // Low order register of a twin word register operand + case 'H': // High order register of a twin word register operand + { + const SparcSubtarget &Subtarget = MF->getSubtarget(); + const MachineOperand &MO = MI->getOperand(OpNo); + const SparcRegisterInfo *RegisterInfo = Subtarget.getRegisterInfo(); + Register MOReg = MO.getReg(); + + Register HiReg, LoReg; + if (!SP::IntPairRegClass.contains(MOReg)) { + // If we aren't given a register pair already, find out which pair it + // belongs to. Note that here, the specified register operand, which + // refers to the high part of the twinword, needs to be an even-numbered + // register. 
+ MOReg = RegisterInfo->getMatchingSuperReg(MOReg, SP::sub_even, + &SP::IntPairRegClass); + if (!MOReg) { + SMLoc Loc; + OutContext.reportError( + Loc, "Hi part of pair should point to an even-numbered register"); + OutContext.reportError( + Loc, "(note that in some cases it might be necessary to manually " + "bind the input/output registers instead of relying on " + "automatic allocation)"); + return true; + } + } + + HiReg = RegisterInfo->getSubReg(MOReg, SP::sub_even); + LoReg = RegisterInfo->getSubReg(MOReg, SP::sub_odd); + + Register Reg; + switch (ExtraCode[0]) { + case 'L': + Reg = LoReg; + break; + case 'H': + Reg = HiReg; + break; + } + + O << '%' << SparcInstPrinter::getRegisterName(Reg); + return false; + } case 'f': case 'r': break; diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 78bdf3ae9a84ba..bdefb0841a124b 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -13,6 +13,7 @@ #include "SparcISelLowering.h" #include "MCTargetDesc/SparcMCExpr.h" +#include "MCTargetDesc/SparcMCTargetDesc.h" #include "SparcMachineFunctionInfo.h" #include "SparcRegisterInfo.h" #include "SparcTargetMachine.h" @@ -28,6 +29,7 @@ #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/Support/ErrorHandling.h" @@ -729,6 +731,30 @@ SDValue SparcTargetLowering::LowerFormalArguments_64( return Chain; } +// Check whether any of the argument registers are reserved +static bool isAnyArgRegReserved(const SparcRegisterInfo *TRI, + const MachineFunction &MF) { + // The register window design means that outgoing parameters at O* + // will appear in the callee as I*. + // Be conservative and check both sides of the register names. 
+ bool Outgoing = + llvm::any_of(SP::GPROutgoingArgRegClass, [TRI, &MF](MCPhysReg r) { + return TRI->isReservedReg(MF, r); + }); + bool Incoming = + llvm::any_of(SP::GPRIncomingArgRegClass, [TRI, &MF](MCPhysReg r) { + return TRI->isReservedReg(MF, r); + }); + return Outgoing || Incoming; +} + +static void emitReservedArgRegCallError(const MachineFunction &MF) { + const Function &F = MF.getFunction(); + F.getContext().diagnose(DiagnosticInfoUnsupported{ + F, ("SPARC doesn't support" + " function calls if any of the argument registers is reserved.")}); +} + SDValue SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { @@ -805,6 +831,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, bool &isTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; + MachineFunction &MF = DAG.getMachineFunction(); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; @@ -1055,6 +1082,10 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, ((hasReturnsTwice) ? TRI->getRTCallPreservedMask(CallConv) : TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv)); + + if (isAnyArgRegReserved(TRI, MF)) + emitReservedArgRegCallError(MF); + assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -1125,6 +1156,13 @@ Register SparcTargetLowering::getRegisterByName(const char* RegName, LLT VT, .Case("g4", SP::G4).Case("g5", SP::G5).Case("g6", SP::G6).Case("g7", SP::G7) .Default(0); + // If we're directly referencing register names + // (e.g in GCC C extension `register int r asm("g1");`), + // make sure that said register is in the reserve list. 
+ const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo(); + if (!TRI->isReservedReg(MF, Reg)) + Reg = 0; + if (Reg) return Reg; @@ -1189,6 +1227,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, SDLoc DL = CLI.DL; SDValue Chain = CLI.Chain; auto PtrVT = getPointerTy(DAG.getDataLayout()); + MachineFunction &MF = DAG.getMachineFunction(); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; @@ -1372,6 +1411,10 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, ((hasReturnsTwice) ? TRI->getRTCallPreservedMask(CLI.CallConv) : TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv)); + + if (isAnyArgRegReserved(TRI, MF)) + emitReservedArgRegCallError(MF); + assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp b/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp index f97bf57627d1aa..71a27f77d2c6bf 100644 --- a/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -12,10 +12,8 @@ #include "SparcRegisterInfo.h" #include "Sparc.h" -#include "SparcMachineFunctionInfo.h" #include "SparcSubtarget.h" #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -98,9 +96,21 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (unsigned n = 0; n < 31; n++) Reserved.set(SP::ASR1 + n); + for (TargetRegisterClass::iterator i = SP::IntRegsRegClass.begin(); + i != SP::IntRegsRegClass.end(); ++i) { + if (MF.getSubtarget().isRegisterReserved(*i)) + markSuperRegs(Reserved, *i); + } + + assert(checkAllSuperRegsMarked(Reserved)); return Reserved; } +bool SparcRegisterInfo::isReservedReg(const MachineFunction &MF, + MCRegister Reg) const { + return 
getReservedRegs(MF)[Reg]; +} + const TargetRegisterClass* SparcRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.h b/llvm/lib/Target/Sparc/SparcRegisterInfo.h index 5b3c1a7ad07dd5..58c85f33635f2d 100644 --- a/llvm/lib/Target/Sparc/SparcRegisterInfo.h +++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.h @@ -30,6 +30,7 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo { const uint32_t* getRTCallPreservedMask(CallingConv::ID CC) const; BitVector getReservedRegs(const MachineFunction &MF) const override; + bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const; const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF, unsigned Kind) const override; diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/llvm/lib/Target/Sparc/SparcRegisterInfo.td index d5ba7464695c5f..d8319a8d41dda0 100644 --- a/llvm/lib/Target/Sparc/SparcRegisterInfo.td +++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.td @@ -370,6 +370,10 @@ def LowQFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 7)>; // Floating point control register classes. def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>; +// GPR argument registers. 
+def GPROutgoingArg : RegisterClass<"SP", [i32, i64], 32, (sequence "O%u", 0, 5)>; +def GPRIncomingArg : RegisterClass<"SP", [i32, i64], 32, (sequence "I%u", 0, 5)>; + let isAllocatable = 0 in { // Ancillary state registers // FIXME: TICK is special-cased here as it can be accessed diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/llvm/lib/Target/Sparc/SparcSubtarget.cpp index 6b09904ca5e8e5..5b65e34e0f8a36 100644 --- a/llvm/lib/Target/Sparc/SparcSubtarget.cpp +++ b/llvm/lib/Target/Sparc/SparcSubtarget.cpp @@ -50,6 +50,7 @@ SparcSubtarget::SparcSubtarget(const StringRef &CPU, const StringRef &TuneCPU, const StringRef &FS, const TargetMachine &TM, bool is64Bit) : SparcGenSubtargetInfo(TM.getTargetTriple(), CPU, TuneCPU, FS), + ReserveRegister(TM.getMCRegisterInfo()->getNumRegs()), TargetTriple(TM.getTargetTriple()), Is64Bit(is64Bit), InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)), TLInfo(TM, *this), FrameLowering(*this) {} diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.h b/llvm/lib/Target/Sparc/SparcSubtarget.h index cdb210f67482c4..fe4aca5195306a 100644 --- a/llvm/lib/Target/Sparc/SparcSubtarget.h +++ b/llvm/lib/Target/Sparc/SparcSubtarget.h @@ -13,12 +13,14 @@ #ifndef LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H #define LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H +#include "MCTargetDesc/SparcMCTargetDesc.h" #include "SparcFrameLowering.h" #include "SparcISelLowering.h" #include "SparcInstrInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" #include @@ -29,6 +31,10 @@ namespace llvm { class StringRef; class SparcSubtarget : public SparcGenSubtargetInfo { + // ReserveRegister[i] - Register #i is not available as a general purpose + // register. 
+ BitVector ReserveRegister; + Triple TargetTriple; virtual void anchor(); @@ -82,6 +88,10 @@ class SparcSubtarget : public SparcGenSubtargetInfo { return is64Bit() ? 2047 : 0; } + bool isRegisterReserved(MCPhysReg PhysReg) const { + return ReserveRegister[PhysReg]; + } + /// Given a actual stack size as determined by FrameInfo, this function /// returns adjusted framesize which includes space for register window /// spills and arguments. diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index db19c8881c685a..80c994a32ea96a 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -840,8 +840,10 @@ void SystemZELFFrameLowering::inlineStackProbe( StackAllocMI->eraseFromParent(); if (DoneMBB != nullptr) { // Compute the live-in lists for the new blocks. - recomputeLiveIns(*DoneMBB); - recomputeLiveIns(*LoopMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*DoneMBB) || recomputeLiveIns(*LoopMBB); + } while (anyChange); } } @@ -1439,8 +1441,10 @@ void SystemZXPLINKFrameLowering::inlineStackProbe( StackAllocMI->eraseFromParent(); // Compute the live-in lists for the new blocks. 
- recomputeLiveIns(*NextMBB); - recomputeLiveIns(*StackExtMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*StackExtMBB) || recomputeLiveIns(*NextMBB); + } while (anyChange); } bool SystemZXPLINKFrameLowering::hasFP(const MachineFunction &MF) const { diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 924df12578fe4b..5e0b0594b0a421 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1067,7 +1067,8 @@ bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL, if (!isInt<20>(AM.BaseOffs)) return false; - bool RequireD12 = Subtarget.hasVector() && Ty->isVectorTy(); + bool RequireD12 = + Subtarget.hasVector() && (Ty->isVectorTy() || Ty->isIntegerTy(128)); AddressingMode SupportedAM(!RequireD12, true); if (I != nullptr) SupportedAM = supportedAddressingMode(I, Subtarget.hasVector()); @@ -1922,7 +1923,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned N = getNumRegistersForCallingConv(Ctx, CLI.CallConv, OrigArgVT); SlotVT = EVT::getIntegerVT(Ctx, PartVT.getSizeInBits() * N); } else { - SlotVT = Outs[I].ArgVT; + SlotVT = Outs[I].VT; } SDValue SpillSlot = DAG.CreateStackTemporary(SlotVT); int FI = cast(SpillSlot)->getIndex(); @@ -4251,6 +4252,7 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op, if (N->getValueType(0) == MVT::i128) { unsigned BaseOp = 0; unsigned FlagOp = 0; + bool IsBorrow = false; switch (Op.getOpcode()) { default: llvm_unreachable("Unknown instruction!"); case ISD::UADDO: @@ -4260,6 +4262,7 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op, case ISD::USUBO: BaseOp = ISD::SUB; FlagOp = SystemZISD::VSCBI; + IsBorrow = true; break; } SDValue Result = DAG.getNode(BaseOp, DL, MVT::i128, LHS, RHS); @@ -4267,6 +4270,9 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op, Flag = DAG.getNode(ISD::AssertZext, DL, MVT::i128, Flag, DAG.getValueType(MVT::i1)); Flag = 
DAG.getZExtOrTrunc(Flag, DL, N->getValueType(1)); + if (IsBorrow) + Flag = DAG.getNode(ISD::XOR, DL, Flag.getValueType(), + Flag, DAG.getConstant(1, DL, Flag.getValueType())); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Flag); } @@ -4339,6 +4345,7 @@ SDValue SystemZTargetLowering::lowerUADDSUBO_CARRY(SDValue Op, if (VT == MVT::i128) { unsigned BaseOp = 0; unsigned FlagOp = 0; + bool IsBorrow = false; switch (Op.getOpcode()) { default: llvm_unreachable("Unknown instruction!"); case ISD::UADDO_CARRY: @@ -4348,14 +4355,21 @@ SDValue SystemZTargetLowering::lowerUADDSUBO_CARRY(SDValue Op, case ISD::USUBO_CARRY: BaseOp = SystemZISD::VSBI; FlagOp = SystemZISD::VSBCBI; + IsBorrow = true; break; } + if (IsBorrow) + Carry = DAG.getNode(ISD::XOR, DL, Carry.getValueType(), + Carry, DAG.getConstant(1, DL, Carry.getValueType())); Carry = DAG.getZExtOrTrunc(Carry, DL, MVT::i128); SDValue Result = DAG.getNode(BaseOp, DL, MVT::i128, LHS, RHS, Carry); SDValue Flag = DAG.getNode(FlagOp, DL, MVT::i128, LHS, RHS, Carry); Flag = DAG.getNode(ISD::AssertZext, DL, MVT::i128, Flag, DAG.getValueType(MVT::i1)); Flag = DAG.getZExtOrTrunc(Flag, DL, N->getValueType(1)); + if (IsBorrow) + Flag = DAG.getNode(ISD::XOR, DL, Flag.getValueType(), + Flag, DAG.getConstant(1, DL, Flag.getValueType())); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Flag); } @@ -6610,6 +6624,27 @@ SDValue SystemZTargetLowering::combineZERO_EXTEND( return NewSelect; } } + // Convert (zext (xor (trunc X), C)) into (xor (trunc X), C') if the size + // of the result is smaller than the size of X and all the truncated bits + // of X are already zero. 
+ if (N0.getOpcode() == ISD::XOR && + N0.hasOneUse() && N0.getOperand(0).hasOneUse() && + N0.getOperand(0).getOpcode() == ISD::TRUNCATE && + N0.getOperand(1).getOpcode() == ISD::Constant) { + SDValue X = N0.getOperand(0).getOperand(0); + if (VT.isScalarInteger() && VT.getSizeInBits() < X.getValueSizeInBits()) { + KnownBits Known = DAG.computeKnownBits(X); + APInt TruncatedBits = APInt::getBitsSet(X.getValueSizeInBits(), + N0.getValueSizeInBits(), + VT.getSizeInBits()); + if (TruncatedBits.isSubsetOf(Known.Zero)) { + X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X); + APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits()); + return DAG.getNode(ISD::XOR, SDLoc(N0), VT, + X, DAG.getConstant(Mask, SDLoc(N0), VT)); + } + } + } return SDValue(); } diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp index 9f0fd4d0938e97..87ec8aa23080e0 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -877,7 +877,6 @@ void X86AsmPrinter::emitStartOfAsmFile(Module &M) { OutStreamer->emitInt32(FeatureFlagsAnd); // data emitAlignment(WordSize == 4 ? 
Align(4) : Align(8)); // padding - OutStreamer->endSection(Nt); OutStreamer->switchSection(Cur); } } diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index c0d358ead2787b..c2f76a3b8abbea 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -885,8 +885,10 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( } // Update Live In information - recomputeLiveIns(*testMBB); - recomputeLiveIns(*tailMBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*tailMBB) || recomputeLiveIns(*testMBB); + } while (anyChange); } void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64( @@ -1378,10 +1380,11 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, footMBB->addSuccessor(&MBB); } - recomputeLiveIns(*headMBB); - recomputeLiveIns(*bodyMBB); - recomputeLiveIns(*footMBB); - recomputeLiveIns(MBB); + bool anyChange = false; + do { + anyChange = recomputeLiveIns(*footMBB) || recomputeLiveIns(*bodyMBB) || + recomputeLiveIns(*headMBB) || recomputeLiveIns(MBB); + } while (anyChange); } } else { MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 833f058253d880..553d338b77904a 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2923,11 +2923,10 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, } bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { - // Cannot use 32 bit constants to reference objects in kernel code model. - // Cannot use 32 bit constants to reference objects in large PIC mode since - // GOTOFF is 64 bits. + // Cannot use 32 bit constants to reference objects in kernel/large code + // model. 
if (TM.getCodeModel() == CodeModel::Kernel || - (TM.getCodeModel() == CodeModel::Large && TM.isPositionIndependent())) + TM.getCodeModel() == CodeModel::Large) return false; // In static codegen with small code model, we can get the address of a label diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e158312caffdec..71fc6b5047eaa9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -18703,16 +18703,18 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (Subtarget.isTargetDarwin()) { // Darwin only has one model of TLS. Lower to that. unsigned char OpFlag = 0; - unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ? - X86ISD::WrapperRIP : X86ISD::Wrapper; + unsigned WrapperKind = 0; // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. bool PIC32 = PositionIndependent && !Subtarget.is64Bit(); - if (PIC32) + if (PIC32) { OpFlag = X86II::MO_TLVP_PIC_BASE; - else + WrapperKind = X86ISD::Wrapper; + } else { OpFlag = X86II::MO_TLVP; + WrapperKind = X86ISD::WrapperRIP; + } SDLoc DL(Op); SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0), @@ -47033,10 +47035,13 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget)) return V; - // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) - // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or - // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) - // depending on sign of (SarConst - [56,48,32,24,16]) + // fold (SRA (SHL X, ShlConst), SraConst) + // into (SHL (sext_in_reg X), ShlConst - SraConst) + // or (sext_in_reg X) + // or (SRA (sext_in_reg X), SraConst - ShlConst) + // depending on relation between SraConst and ShlConst. + // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. 
That allows + // us to do the sext_in_reg from corresponding bit. // sexts in X86 are MOVs. The MOVs have the same code size // as above SHIFTs (only SHIFT on 1 has lower code size). @@ -47052,29 +47057,29 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); APInt ShlConst = N01->getAsAPIntVal(); - APInt SarConst = N1->getAsAPIntVal(); + APInt SraConst = N1->getAsAPIntVal(); EVT CVT = N1.getValueType(); - if (SarConst.isNegative()) + if (CVT != N01.getValueType()) + return SDValue(); + if (SraConst.isNegative()) return SDValue(); for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) { unsigned ShiftSize = SVT.getSizeInBits(); - // skipping types without corresponding sext/zext and - // ShlConst that is not one of [56,48,32,24,16] + // Only deal with (Size - ShlConst) being equal to 8, 16 or 32. if (ShiftSize >= Size || ShlConst != Size - ShiftSize) continue; SDLoc DL(N); SDValue NN = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); - SarConst = SarConst - (Size - ShiftSize); - if (SarConst == 0) + if (SraConst.eq(ShlConst)) return NN; - if (SarConst.isNegative()) + if (SraConst.ult(ShlConst)) return DAG.getNode(ISD::SHL, DL, VT, NN, - DAG.getConstant(-SarConst, DL, CVT)); + DAG.getConstant(ShlConst - SraConst, DL, CVT)); return DAG.getNode(ISD::SRA, DL, VT, NN, - DAG.getConstant(SarConst, DL, CVT)); + DAG.getConstant(SraConst - ShlConst, DL, CVT)); } return SDValue(); } @@ -47876,6 +47881,7 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, SDValue X, Y; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (SDValue Not = GetNot(N0)) { X = Not; @@ -47889,9 +47895,11 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, X = DAG.getBitcast(VT, X); Y = DAG.getBitcast(VT, Y); SDLoc DL(N); + // We do not split for SSE at all, but we need to split vectors for AVX1 
and // AVX2. - if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) { + if (!Subtarget.useAVX512Regs() && VT.is512BitVector() && + TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) { SDValue LoX, HiX; std::tie(LoX, HiX) = splitVector(X, DAG, DL); SDValue LoY, HiY; @@ -47901,7 +47909,11 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY}); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV}); } - return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y}); + + if (TLI.isTypeLegal(VT)) + return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y}); + + return SDValue(); } // Try to widen AND, OR and XOR nodes to VT in order to remove casts around diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index fe7d90fbcdf707..bb5e22c7142793 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -12422,7 +12422,7 @@ multiclass GF2P8AFFINE_avx512_rmb_imm Op, string OpStr, SDNode OpNode, : avx512_3Op_rm_imm8 { let ExeDomain = VTI.ExeDomain in defm rmbi : AVX512_maskablegetParent()->getFunction().hasOptSize()) { - unsigned Mask; - switch (Opc) { - default: - llvm_unreachable("Unreachable!"); - case X86::BLENDPDrri: - Opc = X86::MOVSDrr; - Mask = 0x03; - break; - case X86::BLENDPSrri: - Opc = X86::MOVSSrr; - Mask = 0x0F; - break; - case X86::VBLENDPDrri: - Opc = X86::VMOVSDrr; - Mask = 0x03; - break; - case X86::VBLENDPSrri: - Opc = X86::VMOVSSrr; - Mask = 0x0F; - break; - } + unsigned Mask = (Opc == X86::BLENDPDrri || Opc == X86::VBLENDPDrri) ? 
0x03: 0x0F; if ((MI.getOperand(3).getImm() ^ Mask) == 1) { +#define FROM_TO(FROM, TO) \ + case X86::FROM: \ + Opc = X86::TO; \ + break; + switch (Opc) { + default: + llvm_unreachable("Unreachable!"); + FROM_TO(BLENDPDrri, MOVSDrr) + FROM_TO(BLENDPSrri, MOVSSrr) + FROM_TO(VBLENDPDrri, VMOVSDrr) + FROM_TO(VBLENDPSrri, VMOVSSrr) + } WorkingMI = CloneIfNew(MI); WorkingMI->setDesc(get(Opc)); WorkingMI->removeOperand(3); break; } +#undef FROM_TO } [[fallthrough]]; case X86::PBLENDWrri: diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td index bbd19cf8d5b25e..461b2badc13134 100644 --- a/llvm/lib/Target/X86/X86InstrVecCompiler.td +++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td @@ -83,6 +83,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // A 128-bit subvector extract from the first 512-bit vector position is a // subregister copy that needs no instruction. Likewise, a 128-bit subvector @@ -95,6 +96,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // A 128-bit subvector extract from the first 512-bit vector position is a // subregister copy that needs no instruction. 
Likewise, a 128-bit subvector @@ -107,6 +109,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // If we're inserting into an all zeros vector, just use a plain move which diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 58ebe023cd61ec..7ce0aa22b99795 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -959,8 +959,10 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, SmallString<256> Code; unsigned MinSize = MI.getOperand(0).getImm(); - if (NextMI != MI.getParent()->end()) { + if (NextMI != MI.getParent()->end() && !NextMI->isInlineAsm()) { // Lower the next MachineInstr to find its byte size. + // If the next instruction is inline assembly, we skip lowering it for now, + // and assume we should always generate NOPs. MCInst MCI; MCIL.Lower(&*NextMI, MCI); diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index a458b5f9ec8fbb..4d55a084b730e4 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -244,7 +244,8 @@ class X86Subtarget final : public X86GenSubtargetInfo { // TODO: Currently we're always allowing widening on CPUs without VLX, // because for many cases we don't have a better option. 
bool canExtendTo512DQ() const { - return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512); + return hasAVX512() && hasEVEX512() && + (!hasVLX() || getPreferVectorWidth() >= 512); } bool canExtendTo512BW() const { return hasBWI() && canExtendTo512DQ(); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index cd40b1d3b09332..be774a89eccbb4 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -6080,6 +6080,10 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, for (const Instruction &I : instructions(Callee)) { if (const auto *CB = dyn_cast(&I)) { + // Having more target features is fine for inline ASM. + if (CB->isInlineAsm()) + continue; + SmallVector Types; for (Value *Arg : CB->args()) Types.push_back(Arg->getType()); diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index f1197c29655380..1adef15771fa17 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -321,6 +321,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { return StringSwitch(Part) .Case("0xac3", "ampere1") .Case("0xac4", "ampere1a") + .Case("0xac5", "ampere1b") .Default("generic"); } @@ -1265,8 +1266,10 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, setFeature(X86::FEATURE_AVX2); if (HasLeaf7 && ((EBX >> 8) & 1)) setFeature(X86::FEATURE_BMI2); - if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save) + if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save) { setFeature(X86::FEATURE_AVX512F); + setFeature(X86::FEATURE_EVEX512); + } if (HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save) setFeature(X86::FEATURE_AVX512DQ); if (HasLeaf7 && ((EBX >> 19) & 1)) @@ -1771,6 +1774,7 @@ bool sys::getHostCPUFeatures(StringMap &Features) { Features["rtm"] = HasLeaf7 && ((EBX >> 11) & 1); // AVX512 is only supported if the OS supports the context save for it. 
Features["avx512f"] = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save; + Features["evex512"] = Features["avx512f"]; Features["avx512dq"] = HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save; Features["rdseed"] = HasLeaf7 && ((EBX >> 18) & 1); Features["adx"] = HasLeaf7 && ((EBX >> 19) & 1); diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 3cbe974ff31421..20f324604aa52a 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -294,6 +294,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gfx12-insts"] = true; Features["atomic-fadd-rtn-insts"] = true; Features["image-insts"] = true; + Features["fp8-conversion-insts"] = true; break; case GK_GFX1151: case GK_GFX1150: diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 8058282c422503..062a3d341007ce 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -652,10 +652,6 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR, // check to see if the pointer is guaranteed to not be modified from entry of // the function to each of the load instructions. - // Because there could be several/many load instructions, remember which - // blocks we know to be transparent to the load. - df_iterator_default_set TranspBlocks; - for (LoadInst *Load : Loads) { // Check to see if the load is invalidated from the start of the block to // the load itself. @@ -669,7 +665,7 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR, // To do this, we perform a depth first search on the inverse CFG from the // loading block. 
for (BasicBlock *P : predecessors(BB)) { - for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks)) + for (BasicBlock *TranspBB : inverse_depth_first(P)) if (AAR.canBasicBlockModify(*TranspBB, Loc)) return false; } diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 733f290b1bc93a..633fcb3314c42f 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -470,6 +470,9 @@ class LowerTypeTestsModule { Function *WeakInitializerFn = nullptr; + GlobalVariable *GlobalAnnotation; + DenseSet FunctionAnnotations; + bool shouldExportConstantsAsAbsoluteSymbols(); uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL); TypeIdLowering importTypeId(StringRef TypeId); @@ -531,6 +534,10 @@ class LowerTypeTestsModule { /// replace each use, which is a direct function call. void replaceDirectCalls(Value *Old, Value *New); + bool isFunctionAnnotation(Value *V) const { + return FunctionAnnotations.contains(V); + } + public: LowerTypeTestsModule(Module &M, ModuleAnalysisManager &AM, ModuleSummaryIndex *ExportSummary, @@ -1377,8 +1384,11 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( // (all?) targets. Switch to a runtime initializer. SmallSetVector GlobalVarUsers; findGlobalVariableUsersOf(F, GlobalVarUsers); - for (auto *GV : GlobalVarUsers) + for (auto *GV : GlobalVarUsers) { + if (GV == GlobalAnnotation) + continue; moveInitializerToModuleConstructor(GV); + } // Can not RAUW F with an expression that uses F. Replace with a temporary // placeholder first. @@ -1837,6 +1847,16 @@ LowerTypeTestsModule::LowerTypeTestsModule( } OS = TargetTriple.getOS(); ObjectFormat = TargetTriple.getObjectFormat(); + + // Function annotation describes or applies to function itself, and + // shouldn't be associated with jump table thunk generated for CFI. 
+ GlobalAnnotation = M.getGlobalVariable("llvm.global.annotations"); + if (GlobalAnnotation && GlobalAnnotation->hasInitializer()) { + const ConstantArray *CA = + cast(GlobalAnnotation->getInitializer()); + for (Value *Op : CA->operands()) + FunctionAnnotations.insert(Op); + } } bool LowerTypeTestsModule::runForTesting(Module &M, ModuleAnalysisManager &AM) { @@ -1896,10 +1916,14 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, if (isa(U.getUser())) continue; - // Skip direct calls to externally defined or non-dso_local functions + // Skip direct calls to externally defined or non-dso_local functions. if (isDirectCall(U) && (Old->isDSOLocal() || !IsJumpTableCanonical)) continue; + // Skip function annotation. + if (isFunctionAnnotation(U.getUser())) + continue; + // Must handle Constants specially, we cannot call replaceUsesOfWith on a // constant because they are uniqued. if (auto *C = dyn_cast(U.getUser())) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index a647be2d26c761..bc43edb5e62065 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -412,11 +412,14 @@ Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) { if (auto *SplatPtr = getSplatValue(II.getArgOperand(1))) { // scatter(splat(value), splat(ptr), non-zero-mask) -> store value, ptr if (auto *SplatValue = getSplatValue(II.getArgOperand(0))) { - Align Alignment = cast(II.getArgOperand(2))->getAlignValue(); - StoreInst *S = - new StoreInst(SplatValue, SplatPtr, /*IsVolatile=*/false, Alignment); - S->copyMetadata(II); - return S; + if (maskContainsAllOneOrUndef(ConstMask)) { + Align Alignment = + cast(II.getArgOperand(2))->getAlignValue(); + StoreInst *S = new StoreInst(SplatValue, SplatPtr, /*IsVolatile=*/false, + Alignment); + S->copyMetadata(II); + return S; + } } // scatter(vector, splat(ptr), splat(true)) -> 
store extract(vector, // lastlane), ptr diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 58f0763bb0c0cd..c5d3f60176a826 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2156,14 +2156,14 @@ static bool collectInsertionElements(Value *V, unsigned Shift, Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize); for (unsigned i = 0; i != NumElts; ++i) { - unsigned ShiftI = Shift + i * ElementSize; + unsigned ShiftI = i * ElementSize; Constant *Piece = ConstantFoldBinaryInstruction( Instruction::LShr, C, ConstantInt::get(C->getType(), ShiftI)); if (!Piece) return false; Piece = ConstantExpr::getTrunc(Piece, ElementIntTy); - if (!collectInsertionElements(Piece, ShiftI, Elements, VecEltTy, + if (!collectInsertionElements(Piece, ShiftI + Shift, Elements, VecEltTy, isBigEndian)) return false; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 8c0fd662255130..9973a80a7db946 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -6491,6 +6491,13 @@ InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, if (!SafeReplacementConstant) SafeReplacementConstant = CI; } + } else if (isa(C->getType())) { + // Handle scalable splat + Value *SplatC = C->getSplatValue(); + auto *CI = dyn_cast_or_null(SplatC); + // Bail out if the constant can't be safely incremented/decremented. + if (!CI || !ConstantIsOk(CI)) + return std::nullopt; } else { // ConstantExpr? 
return std::nullopt; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index bb2a77daa60a76..1254a050027a45 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -1032,7 +1032,8 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) { // where there are several consecutive memory accesses to the same location, // separated by a few arithmetic operations. bool IsLoadCSE = false; - if (Value *AvailableVal = FindAvailableLoadedValue(&LI, *AA, &IsLoadCSE)) { + BatchAAResults BatchAA(*AA); + if (Value *AvailableVal = FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE)) { if (IsLoadCSE) combineMetadataForCSE(cast(AvailableVal), &LI, false); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 21bfc91148bfeb..8cc7901cbac7fa 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1284,7 +1284,11 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, isGuaranteedNotToBeUndefOrPoison(CmpRHS, SQ.AC, &Sel, &DT)) { if (Value *V = simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, SQ, /* AllowRefinement */ true)) - return replaceOperand(Sel, Swapped ? 2 : 1, V); + // Require either the replacement or the simplification result to be a + // constant to avoid infinite loops. + // FIXME: Make this check more precise. + if (isa(CmpRHS) || isa(V)) + return replaceOperand(Sel, Swapped ? 
2 : 1, V); // Even if TrueVal does not simplify, we can directly replace a use of // CmpLHS with CmpRHS, as long as the instruction is not used anywhere @@ -1302,7 +1306,8 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, isGuaranteedNotToBeUndefOrPoison(CmpLHS, SQ.AC, &Sel, &DT)) if (Value *V = simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, SQ, /* AllowRefinement */ true)) - return replaceOperand(Sel, Swapped ? 2 : 1, V); + if (isa(CmpLHS) || isa(V)) + return replaceOperand(Sel, Swapped ? 2 : 1, V); auto *FalseInst = dyn_cast(FalseVal); if (!FalseInst) @@ -2601,7 +2606,7 @@ static Instruction *foldSelectWithSRem(SelectInst &SI, InstCombinerImpl &IC, // %cnd = icmp slt i32 %rem, 0 // %add = add i32 %rem, %n // %sel = select i1 %cnd, i32 %add, i32 %rem - if (match(TrueVal, m_Add(m_Value(RemRes), m_Value(Remainder))) && + if (match(TrueVal, m_Add(m_Specific(RemRes), m_Value(Remainder))) && match(RemRes, m_SRem(m_Value(Op), m_Specific(Remainder))) && IC.isKnownToBeAPowerOfTwo(Remainder, /*OrZero*/ true) && FalseVal == RemRes) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index a8a5f9831e15e3..79873a9b4cbb4c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -802,6 +802,9 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, return InsertNewInstWith(LShr, I->getIterator()); } else if (Known.One[BitWidth-ShiftAmt-1]) { // New bits are known one. Known.One |= HighBits; + // SignBits may be out-of-sync with Known.countMinSignBits(). Mask out + // high bits of Known.Zero to avoid conflicts. 
+ Known.Zero &= ~HighBits; } } else { computeKnownBits(I, Known, Depth, CxtI); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 249f4a7710e046..6f0cf9d9c8f187 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1455,6 +1455,7 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, SelectInst *SI, Value *NewOp, InstCombiner &IC) { Instruction *Clone = I.clone(); Clone->replaceUsesOfWith(SI, NewOp); + Clone->dropUBImplyingAttrsAndMetadata(); IC.InsertNewInstBefore(Clone, SI->getIterator()); return Clone; } @@ -2594,10 +2595,10 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { Value *V; if ((has_single_bit(TyAllocSize) && match(GEP.getOperand(1), - m_Exact(m_AShr(m_Value(V), - m_SpecificInt(countr_zero(TyAllocSize)))))) || + m_Exact(m_Shr(m_Value(V), + m_SpecificInt(countr_zero(TyAllocSize)))))) || match(GEP.getOperand(1), - m_Exact(m_SDiv(m_Value(V), m_SpecificInt(TyAllocSize))))) { + m_Exact(m_IDiv(m_Value(V), m_SpecificInt(TyAllocSize))))) { GetElementPtrInst *NewGEP = GetElementPtrInst::Create( Builder.getInt8Ty(), GEP.getPointerOperand(), V); NewGEP->setIsInBounds(GEP.isInBounds()); diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 8ee0bca7e354f0..0f42ff79086994 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -752,11 +752,12 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { const unsigned ByteSize = 1U << Idx; const unsigned BitSize = ByteSize * 8; Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); - Value *Args[] = {Addr, - IRB.CreateIntCast(RMWI->getValOperand(), Ty, false), + Value *Val = RMWI->getValOperand(); + Value *Args[] = {Addr, 
IRB.CreateBitOrPointerCast(Val, Ty), createOrdering(&IRB, RMWI->getOrdering())}; - CallInst *C = CallInst::Create(F, Args); - ReplaceInstWithInst(I, C); + Value *C = IRB.CreateCall(F, Args); + I->replaceAllUsesWith(IRB.CreateBitOrPointerCast(C, Val->getType())); + I->eraseFromParent(); } else if (AtomicCmpXchgInst *CASI = dyn_cast(I)) { Value *Addr = CASI->getPointerOperand(); Type *OrigOldValTy = CASI->getNewValOperand()->getType(); diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 8f09569d0d9cc9..7b672e89b67aae 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -1061,11 +1061,16 @@ void State::addInfoFor(BasicBlock &BB) { FactOrCheck::getCheck(DT.getNode(&BB), cast(&I))); break; // Enqueue the intrinsics to add extra info. - case Intrinsic::abs: case Intrinsic::umin: case Intrinsic::umax: case Intrinsic::smin: case Intrinsic::smax: + // TODO: Check if it is possible to instead only added the min/max facts + // when simplifying uses of the min/max intrinsics. + if (!isGuaranteedNotToBePoison(&I)) + break; + [[fallthrough]]; + case Intrinsic::abs: WorkList.push_back(FactOrCheck::getInstFact(DT.getNode(&BB), &I)); break; } diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 11a91bfbe5baff..380d6583655367 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -857,6 +857,9 @@ struct DSEState { // no longer be captured. bool ShouldIterateEndOfFunctionDSE; + /// Dead instructions to be removed at the end of DSE. + SmallVector ToRemove; + // Class contains self-reference, make sure it's not copied/moved. 
DSEState(const DSEState &) = delete; DSEState &operator=(const DSEState &) = delete; @@ -1692,7 +1695,8 @@ struct DSEState { return {MaybeDeadAccess}; } - // Delete dead memory defs + /// Delete dead memory defs and recursively add their operands to ToRemove if + /// they became dead. void deleteDeadInstruction(Instruction *SI) { MemorySSAUpdater Updater(&MSSA); SmallVector NowDeadInsts; @@ -1708,8 +1712,11 @@ struct DSEState { salvageKnowledge(DeadInst); // Remove the Instruction from MSSA. - if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) { - if (MemoryDef *MD = dyn_cast(MA)) { + MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst); + bool IsMemDef = MA && isa(MA); + if (MA) { + if (IsMemDef) { + auto *MD = cast(MA); SkipStores.insert(MD); if (auto *SI = dyn_cast(MD->getMemoryInst())) { if (SI->getValueOperand()->getType()->isPointerTy()) { @@ -1730,13 +1737,21 @@ struct DSEState { // Remove its operands for (Use &O : DeadInst->operands()) if (Instruction *OpI = dyn_cast(O)) { - O = nullptr; + O.set(PoisonValue::get(O->getType())); if (isInstructionTriviallyDead(OpI, &TLI)) NowDeadInsts.push_back(OpI); } EI.removeInstruction(DeadInst); - DeadInst->eraseFromParent(); + // Remove memory defs directly if they don't produce results, but only + // queue other dead instructions for later removal. They may have been + // used as memory locations that have been cached by BatchAA. Removing + // them here may lead to newly created instructions to be allocated at the + // same address, yielding stale cache entries. 
+ if (IsMemDef && DeadInst->getType()->isVoidTy()) + DeadInst->eraseFromParent(); + else + ToRemove.push_back(DeadInst); } } @@ -1892,15 +1907,15 @@ struct DSEState { Malloc->getArgOperand(0), IRB, TLI); if (!Calloc) return false; + MemorySSAUpdater Updater(&MSSA); auto *NewAccess = Updater.createMemoryAccessAfter(cast(Calloc), nullptr, MallocDef); auto *NewAccessMD = cast(NewAccess); Updater.insertDef(NewAccessMD, /*RenameUses=*/true); - Updater.removeMemoryAccess(Malloc); Malloc->replaceAllUsesWith(Calloc); - Malloc->eraseFromParent(); + deleteDeadInstruction(Malloc); return true; } @@ -2233,6 +2248,12 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, MadeChange |= State.eliminateRedundantStoresOfExistingValues(); MadeChange |= State.eliminateDeadWritesAtEndOfFunction(); + + while (!State.ToRemove.empty()) { + Instruction *DeadInst = State.ToRemove.pop_back_val(); + DeadInst->eraseFromParent(); + } + return MadeChange; } } // end anonymous namespace diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 9df28747570c4d..104e8ceb796700 100644 --- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -279,6 +279,9 @@ bool InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); + if (!LHS->getType()->isIntegerTy()) + return false; + // Canonicalize to the `Index Pred Invariant` comparison if (IsLoopInvariant(LHS)) { std::swap(LHS, RHS); diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 8603c5cf9c022c..87c01ead634ff8 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1260,8 +1260,11 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { // the entry 
to its block. BasicBlock::iterator BBIt(LoadI); bool IsLoadCSE; + BatchAAResults BatchAA(*AA); + // The dominator tree is updated lazily and may not be valid at this point. + BatchAA.disableDominatorTree(); if (Value *AvailableVal = FindAvailableLoadedValue( - LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) { + LoadI, LoadBB, BBIt, DefMaxInstsToScan, &BatchAA, &IsLoadCSE)) { // If the value of the load is locally available within the block, just use // it. This frequently occurs for reg2mem'd allocas. @@ -1322,9 +1325,9 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { MemoryLocation Loc(LoadedPtr->DoPHITranslation(LoadBB, PredBB), LocationSize::precise(DL.getTypeStoreSize(AccessTy)), AATags); - PredAvailable = findAvailablePtrLoadStore(Loc, AccessTy, LoadI->isAtomic(), - PredBB, BBIt, DefMaxInstsToScan, - AA, &IsLoadCSE, &NumScanedInst); + PredAvailable = findAvailablePtrLoadStore( + Loc, AccessTy, LoadI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan, + &BatchAA, &IsLoadCSE, &NumScanedInst); // If PredBB has a single predecessor, continue scanning through the // single predecessor. 
@@ -1336,7 +1339,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { BBIt = SinglePredBB->end(); PredAvailable = findAvailablePtrLoadStore( Loc, AccessTy, LoadI->isAtomic(), SinglePredBB, BBIt, - (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE, + (DefMaxInstsToScan - NumScanedInst), &BatchAA, &IsLoadCSE, &NumScanedInst); } } diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index bdbaf4f55c96d0..17a94f9381bf8e 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2257,6 +2257,41 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, return nullptr; } +static VectorType *createAndCheckVectorTypesForPromotion( + SetVector &OtherTys, ArrayRef CandidateTysCopy, + function_ref CheckCandidateType, Partition &P, + const DataLayout &DL, SmallVectorImpl &CandidateTys, + bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy, + bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy) { + [[maybe_unused]] VectorType *OriginalElt = + CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr; + // Consider additional vector types where the element type size is a + // multiple of load/store element size. + for (Type *Ty : OtherTys) { + if (!VectorType::isValidElementType(Ty)) + continue; + unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue(); + // Make a copy of CandidateTys and iterate through it, because we + // might append to CandidateTys in the loop. 
+ for (VectorType *const VTy : CandidateTysCopy) { + // The elements in the copy should remain invariant throughout the loop + assert(CandidateTysCopy[0] == OriginalElt && "Different Element"); + unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue(); + unsigned ElementSize = + DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue(); + if (TypeSize != VectorSize && TypeSize != ElementSize && + VectorSize % TypeSize == 0) { + VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false); + CheckCandidateType(NewVTy); + } + } + } + + return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy, + CommonEltTy, HaveVecPtrTy, + HaveCommonVecPtrTy, CommonVecPtrTy); +} + /// Test whether the given alloca partitioning and range of slices can be /// promoted to a vector. /// @@ -2271,6 +2306,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // we have different element types. SmallVector CandidateTys; SetVector LoadStoreTys; + SetVector DeferredTys; Type *CommonEltTy = nullptr; VectorType *CommonVecPtrTy = nullptr; bool HaveVecPtrTy = false; @@ -2314,42 +2350,32 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { Ty = SI->getValueOperand()->getType(); else continue; + + auto CandTy = Ty->getScalarType(); + if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() || + S.endOffset() != P.endOffset())) { + DeferredTys.insert(Ty); + continue; + } + LoadStoreTys.insert(Ty); // Consider any loads or stores that are the exact size of the slice. 
if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset()) CheckCandidateType(Ty); } - if (auto *VTy = checkVectorTypesForPromotion( - P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, + SmallVector CandidateTysCopy = CandidateTys; + if (auto *VTy = createAndCheckVectorTypesForPromotion( + LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL, + CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy, CommonVecPtrTy)) return VTy; - // Consider additional vector types where the element type size is a - // multiple of load/store element size. - for (Type *Ty : LoadStoreTys) { - if (!VectorType::isValidElementType(Ty)) - continue; - unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue(); - // Make a copy of CandidateTys and iterate through it, because we might - // append to CandidateTys in the loop. - SmallVector CandidateTysCopy = CandidateTys; - CandidateTys.clear(); - for (VectorType *&VTy : CandidateTysCopy) { - unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue(); - unsigned ElementSize = - DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue(); - if (TypeSize != VectorSize && TypeSize != ElementSize && - VectorSize % TypeSize == 0) { - VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false); - CheckCandidateType(NewVTy); - } - } - } - - return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy, - CommonEltTy, HaveVecPtrTy, - HaveCommonVecPtrTy, CommonVecPtrTy); + CandidateTys.clear(); + return createAndCheckVectorTypesForPromotion( + DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys, + HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy, + CommonVecPtrTy); } /// Test whether a slice of an alloca is valid for integer widening. 
diff --git a/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/llvm/lib/Transforms/Utils/FlattenCFG.cpp index 1925b91c4da7ec..c5cb3748a52f8d 100644 --- a/llvm/lib/Transforms/Utils/FlattenCFG.cpp +++ b/llvm/lib/Transforms/Utils/FlattenCFG.cpp @@ -407,6 +407,10 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2, /// form, by inverting the condition and the branch successors. The same /// approach goes for the opposite case. bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { + // We cannot merge the if-region if the merge point has phi nodes. + if (isa(BB->front())) + return false; + BasicBlock *IfTrue2, *IfFalse2; BranchInst *DomBI2 = GetIfCondition(BB, IfTrue2, IfFalse2); if (!DomBI2) @@ -493,16 +497,6 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { PBI->replaceUsesOfWith(PBI->getCondition(), NC); Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt); - // Handle PHI node to replace its predecessors to FirstEntryBlock. - for (BasicBlock *Succ : successors(PBI)) { - for (PHINode &Phi : Succ->phis()) { - for (unsigned i = 0, e = Phi.getNumIncomingValues(); i != e; ++i) { - if (Phi.getIncomingBlock(i) == SecondEntryBlock) - Phi.setIncomingBlock(i, FirstEntryBlock); - } - } - } - // Remove IfTrue1 if (IfTrue1 != FirstEntryBlock) { IfTrue1->dropAllReferences(); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 459e3d98059283..a1c6bbc52fd05e 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3369,11 +3369,17 @@ void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) { // Patch the replacement so that it is not more restrictive than the value // being replaced. + WithOverflowInst *UnusedWO; + // When replacing the result of a llvm.*.with.overflow intrinsic with a + // overflowing binary operator, nuw/nsw flags may no longer hold. 
+ if (isa(ReplInst) && + match(I, m_ExtractValue<0>(m_WithOverflowInst(UnusedWO)))) + ReplInst->dropPoisonGeneratingFlags(); // Note that if 'I' is a load being replaced by some operation, // for example, by an arithmetic operation, then andIRFlags() // would just erase all math flags from the original arithmetic // operation, which is clearly not wanted and not needed. - if (!isa(I)) + else if (!isa(I)) ReplInst->andIRFlags(I); // FIXME: If both the original and replacement value are part of the diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index a1d7f0f9ba0f74..a3951fdf8a1589 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1366,61 +1366,6 @@ Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty) { return V; } -static bool -canReuseInstruction(ScalarEvolution &SE, const SCEV *S, Instruction *I, - SmallVectorImpl &DropPoisonGeneratingInsts) { - // If the instruction cannot be poison, it's always safe to reuse. - if (programUndefinedIfPoison(I)) - return true; - - // Otherwise, it is possible that I is more poisonous that S. Collect the - // poison-contributors of S, and then check whether I has any additional - // poison-contributors. Poison that is contributed through poison-generating - // flags is handled by dropping those flags instead. - SmallPtrSet PoisonVals; - SE.getPoisonGeneratingValues(PoisonVals, S); - - SmallVector Worklist; - SmallPtrSet Visited; - Worklist.push_back(I); - while (!Worklist.empty()) { - Value *V = Worklist.pop_back_val(); - if (!Visited.insert(V).second) - continue; - - // Avoid walking large instruction graphs. - if (Visited.size() > 16) - return false; - - // Either the value can't be poison, or the S would also be poison if it - // is. 
- if (PoisonVals.contains(V) || isGuaranteedNotToBePoison(V)) - continue; - - auto *I = dyn_cast(V); - if (!I) - return false; - - // FIXME: Ignore vscale, even though it technically could be poison. Do this - // because SCEV currently assumes it can't be poison. Remove this special - // case once we proper model when vscale can be poison. - if (auto *II = dyn_cast(I); - II && II->getIntrinsicID() == Intrinsic::vscale) - continue; - - if (canCreatePoison(cast(I), /*ConsiderFlagsAndMetadata*/ false)) - return false; - - // If the instruction can't create poison, we can recurse to its operands. - if (I->hasPoisonGeneratingFlagsOrMetadata()) - DropPoisonGeneratingInsts.push_back(I); - - for (Value *Op : I->operands()) - Worklist.push_back(Op); - } - return true; -} - Value *SCEVExpander::FindValueInExprValueMap( const SCEV *S, const Instruction *InsertPt, SmallVectorImpl &DropPoisonGeneratingInsts) { @@ -1448,7 +1393,7 @@ Value *SCEVExpander::FindValueInExprValueMap( continue; // Make sure reusing the instruction is poison-safe. 
- if (canReuseInstruction(SE, S, EntInst, DropPoisonGeneratingInsts)) + if (SE.canReuseInstruction(S, EntInst, DropPoisonGeneratingInsts)) return V; DropPoisonGeneratingInsts.clear(); } diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index 0ed3324a27b6c9..1b142f14d81139 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -713,8 +714,11 @@ bool SimplifyIndvar::replaceFloatIVWithIntegerIV(Instruction *UseInst) { bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand) { if (!SE->isSCEVable(UseInst->getType()) || - (UseInst->getType() != IVOperand->getType()) || - (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand))) + UseInst->getType() != IVOperand->getType()) + return false; + + const SCEV *UseSCEV = SE->getSCEV(UseInst); + if (UseSCEV != SE->getSCEV(IVOperand)) return false; // getSCEV(X) == getSCEV(Y) does not guarantee that X and Y are related in the @@ -742,6 +746,16 @@ bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst, if (!LI->replacementPreservesLCSSAForm(UseInst, IVOperand)) return false; + // Make sure the operand is not more poisonous than the instruction. 
+ if (!impliesPoison(IVOperand, UseInst)) { + SmallVector DropPoisonGeneratingInsts; + if (!SE->canReuseInstruction(UseSCEV, IVOperand, DropPoisonGeneratingInsts)) + return false; + + for (Instruction *I : DropPoisonGeneratingInsts) + I->dropPoisonGeneratingFlagsAndMetadata(); + } + LLVM_DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n'); SE->forgetValue(UseInst); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6ca93e15719fb2..dd596c567cd482 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1957,6 +1957,8 @@ class GeneratedRTChecks { bool CostTooHigh = false; const bool AddBranchWeights; + Loop *OuterLoop = nullptr; + public: GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, TargetTransformInfo *TTI, const DataLayout &DL, @@ -2053,6 +2055,9 @@ class GeneratedRTChecks { DT->eraseNode(SCEVCheckBlock); LI->removeBlock(SCEVCheckBlock); } + + // Outer loop is used as part of the later cost calculations. + OuterLoop = L->getParentLoop(); } InstructionCost getCost() { @@ -2076,16 +2081,61 @@ class GeneratedRTChecks { LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); RTCheckCost += C; } - if (MemCheckBlock) + if (MemCheckBlock) { + InstructionCost MemCheckCost = 0; for (Instruction &I : *MemCheckBlock) { if (MemCheckBlock->getTerminator() == &I) continue; InstructionCost C = TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); - RTCheckCost += C; + MemCheckCost += C; } + // If the runtime memory checks are being created inside an outer loop + // we should find out if these checks are outer loop invariant. If so, + // the checks will likely be hoisted out and so the effective cost will + // reduce according to the outer loop trip count. 
+ if (OuterLoop) { + ScalarEvolution *SE = MemCheckExp.getSE(); + // TODO: If profitable, we could refine this further by analysing every + // individual memory check, since there could be a mixture of loop + // variant and invariant checks that mean the final condition is + // variant. + const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond); + if (SE->isLoopInvariant(Cond, OuterLoop)) { + // It seems reasonable to assume that we can reduce the effective + // cost of the checks even when we know nothing about the trip + // count. Assume that the outer loop executes at least twice. + unsigned BestTripCount = 2; + + // If exact trip count is known use that. + if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop)) + BestTripCount = SmallTC; + else if (LoopVectorizeWithBlockFrequency) { + // Else use profile data if available. + if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop)) + BestTripCount = *EstimatedTC; + } + + InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount; + + // Let's ensure the cost is always at least 1. + NewMemCheckCost = std::max(*NewMemCheckCost.getValue(), + (InstructionCost::CostType)1); + + LLVM_DEBUG(dbgs() + << "We expect runtime memory checks to be hoisted " + << "out of the outer loop. Cost reduced from " + << MemCheckCost << " to " << NewMemCheckCost << '\n'); + + MemCheckCost = NewMemCheckCost; + } + } + + RTCheckCost += MemCheckCost; + } + if (SCEVCheckBlock || MemCheckBlock) LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost << "\n"); @@ -2144,8 +2194,8 @@ class GeneratedRTChecks { BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); // Create new preheader for vector loop. 
- if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) - PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); + if (OuterLoop) + OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI); SCEVCheckBlock->getTerminator()->eraseFromParent(); SCEVCheckBlock->moveBefore(LoopVectorPreHeader); @@ -2179,8 +2229,8 @@ class GeneratedRTChecks { DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); MemCheckBlock->moveBefore(LoopVectorPreHeader); - if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) - PL->addBasicBlockToLoop(MemCheckBlock, *LI); + if (OuterLoop) + OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI); BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 601d2454c1e163..1fbd69e38eaeec 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10215,8 +10215,18 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { UniqueBases.insert(VecBase); // If the only one use is vectorized - can delete the extractelement // itself. 
- if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) { - return !R.ScalarToTreeEntry.count(U); + if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) || + any_of(EI->users(), [&](User *U) { + const TreeEntry *UTE = R.getTreeEntry(U); + return !UTE || R.MultiNodeScalars.contains(U) || + count_if(R.VectorizableTree, + [&](const std::unique_ptr &TE) { + return any_of(TE->UserTreeIndices, + [&](const EdgeInfo &Edge) { + return Edge.UserTE == UTE; + }) && + is_contained(TE->Scalars, EI); + }) != 1; })) continue; R.eraseInstruction(EI); @@ -11643,12 +11653,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1)) TysForDecl.push_back( FixedVectorType::get(CI->getType(), E->Scalars.size())); + auto *CEI = cast(VL0); for (unsigned I : seq(0, CI->arg_size())) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) { - CallInst *CEI = cast(VL0); ScalarArg = CEI->getArgOperand(I); OpVecs.push_back(CEI->getArgOperand(I)); if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) @@ -11661,6 +11671,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } + auto GetOperandSignedness = [&](unsigned Idx) { + const TreeEntry *OpE = getOperandEntry(E, Idx); + bool IsSigned = false; + auto It = MinBWs.find(OpE); + if (It != MinBWs.end()) + IsSigned = It->second.second; + else + IsSigned = any_of(OpE->Scalars, [&](Value *R) { + return !isKnownNonNegative(R, SimplifyQuery(*DL)); + }); + return IsSigned; + }; + ScalarArg = CEI->getArgOperand(I); + if (cast(OpVec->getType())->getElementType() != + ScalarArg->getType()) { + auto *CastTy = FixedVectorType::get(ScalarArg->getType(), + VecTy->getNumElements()); + OpVec = Builder.CreateIntCast(OpVec, CastTy, 
GetOperandSignedness(I)); + } LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index bbeb5da2cfec3e..ae2fc522ba4002 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -597,13 +597,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { for (const auto &I : enumerate(operands())) { // Some intrinsics have a scalar argument - don't replace it with a // vector. - // Some vectorized function variants may also take a scalar argument, - // e.g. linear parameters for pointers. Value *Arg; - if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) || - (UseIntrinsic && - isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))) + if (UseIntrinsic && + isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) Arg = State.get(I.value(), VPIteration(0, 0)); + // Some vectorized function variants may also take a scalar argument, + // e.g. linear parameters for pointers. This needs to be the scalar value + // from the start of the respective part when interleaving. 
+ else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy()) + Arg = State.get(I.value(), VPIteration(Part, 0)); else Arg = State.get(I.value(), Part); if (UseIntrinsic && diff --git a/llvm/test/Analysis/CostModel/RISCV/vector-cost-without-v.ll b/llvm/test/Analysis/CostModel/RISCV/vector-cost-without-v.ll new file mode 100644 index 00000000000000..cd99065f0285cd --- /dev/null +++ b/llvm/test/Analysis/CostModel/RISCV/vector-cost-without-v.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -mtriple=riscv64 -mattr=+f,+d --passes=loop-unroll-full -S | FileCheck %s + +; Check it doesn't crash when the vector extension is not enabled. +define void @foo() { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4 +; CHECK-NEXT: [[SPLAT_SPLAT_I_I_I:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer +; CHECK-NEXT: [[CMP1_I_I_I:%.*]] = fcmp ogt <2 x float> zeroinitializer, zeroinitializer +; CHECK-NEXT: [[SPLAT_SPLAT3_I_I_I:%.*]] = shufflevector <2 x i32> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer +; CHECK-NEXT: [[XOR3_I_I_I_I_I:%.*]] = select <2 x i1> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr null, align 4 +; CHECK-NEXT: [[SPLAT_SPLAT8_I_I_I:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer +; CHECK-NEXT: [[SUB_I_I_I:%.*]] = fsub <2 x float> zeroinitializer, zeroinitializer +; CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i64 0, 0 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr null, align 4 +; CHECK-NEXT: 
[[SPLAT_SPLAT_I_I_I_I:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer +; CHECK-NEXT: [[XOR3_I_I_I_V_I_I:%.*]] = select <2 x i1> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV1]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV1]], 8 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv1 = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %0 = load float, ptr null, align 4 + %splat.splat.i.i.i = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer + %cmp1.i.i.i = fcmp ogt <2 x float> zeroinitializer, zeroinitializer + %splat.splat3.i.i.i = shufflevector <2 x i32> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer + %xor3.i.i.i.i.i = select <2 x i1> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer + %1 = load float, ptr null, align 4 + %splat.splat8.i.i.i = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer + %sub.i.i.i = fsub <2 x float> zeroinitializer, zeroinitializer + %mul.i.i.i = shl i64 0, 0 + %2 = load float, ptr null, align 4 + %splat.splat.i.i.i.i = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer + %xor3.i.i.i.v.i.i = select <2 x i1> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer + %indvars.iv.next = add i64 %indvars.iv1, 1 + %exitcond = icmp ne i64 %indvars.iv1, 8 + br i1 %exitcond, label %for.body, label %exit + +exit: ; preds = %for.body + ret void +} diff --git a/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll b/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll index 98bb5f99a40a1e..fb296f5089422d 100644 --- 
a/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll @@ -7,8 +7,17 @@ define void @test_scope_in_loop(ptr %arg, i64 %num) { ; CHECK-LABEL: 'test_scope_in_loop' ; CHECK-NEXT: loop: -; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Backward loop carried data dependence. ; CHECK-NEXT: Dependences: +; CHECK-NEXT: Backward: +; CHECK-NEXT: %load.prev = load i8, ptr %prev.ptr, align 1, !alias.scope !0, !noalias !3 -> +; CHECK-NEXT: store i8 %add, ptr %cur.ptr, align 1, !alias.scope !3 +; CHECK-EMPTY: +; CHECK-NEXT: Forward: +; CHECK-NEXT: %load.cur = load i8, ptr %cur.ptr, align 1, !alias.scope !3 -> +; CHECK-NEXT: store i8 %add, ptr %cur.ptr, align 1, !alias.scope !3 +; CHECK-EMPTY: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: ; CHECK-EMPTY: diff --git a/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll b/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll new file mode 100644 index 00000000000000..106dc8c13a49fa --- /dev/null +++ b/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll @@ -0,0 +1,180 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes='print' -disable-output %s 2>&1 | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +; Test case for https://github.com/llvm/llvm-project/issues/82665. +define void @indirect_ptr_recurrences_read_write(ptr %A, ptr %B) { +; CHECK-LABEL: 'indirect_ptr_recurrences_read_write' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. 
Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unsafe indirect dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: IndidrectUnsafe: +; CHECK-NEXT: %l = load i32, ptr %ptr.recur, align 4, !tbaa !4 -> +; CHECK-NEXT: store i32 %xor, ptr %ptr.recur, align 4, !tbaa !4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ] + %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ] + %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv + %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6 + %l = load i32, ptr %ptr.recur, align 4, !tbaa !10 + %xor = xor i32 %l, 1 + store i32 %xor, ptr %ptr.recur, align 4, !tbaa !10 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 5 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define i32 @indirect_ptr_recurrences_read_only_loop(ptr %A, ptr %B) { +; CHECK-LABEL: 'indirect_ptr_recurrences_read_only_loop' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ] + %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %xor, %loop ] + %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv + %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6 + %l = load i32, ptr %ptr.recur, align 4, !tbaa !10 + %xor = xor i32 %l, 1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 5 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %xor +} + +define void @indirect_ptr_recurrences_read_write_may_alias_no_tbaa(ptr %A, ptr %B) { +; CHECK-LABEL: 'indirect_ptr_recurrences_read_write_may_alias_no_tbaa' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: cannot identify array bounds +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ] + %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ] + %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv + %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6 + %l = load i32, ptr %ptr.recur, align 4 + %xor = xor i32 %l, 1 + store i32 %xor, ptr %ptr.recur, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 5 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @indirect_ptr_recurrences_read_write_may_alias_different_obj(ptr %A, ptr %B, ptr %C) { +; CHECK-LABEL: 'indirect_ptr_recurrences_read_write_may_alias_different_obj' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: cannot identify array bounds +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ] + %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ] + %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv + %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6 + %l = load i32, ptr %ptr.recur, align 4 + %xor = xor i32 %l, 1 + %gep.C = getelementptr inbounds ptr, ptr %C, i64 %iv + store i32 %xor, ptr %gep.C, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 5 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @indirect_ptr_recurrences_read_write_may_noalias_different_obj(ptr %A, ptr %B, ptr noalias %C) { +; CHECK-LABEL: 'indirect_ptr_recurrences_read_write_may_noalias_different_obj' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ] + %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ] + %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv + %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6 + %l = load i32, ptr %ptr.recur, align 4 + %xor = xor i32 %l, 1 + %gep.C = getelementptr inbounds ptr, ptr %C, i64 %iv + store i32 %xor, ptr %gep.C, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 5 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + + +!6 = !{!7, !7, i64 0} +!7 = !{!"any pointer", !8, i64 0} +!8 = !{!"omnipotent char", !9, i64 0} +!9 = !{!"Simple C/C++ TBAA"} +!10 = !{!11, !11, i64 0} +!11 = !{!"int", !8, i64 0} diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index a08ca86c8a619b..4b6f6159ff23f3 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -63,52 +63,140 @@ define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 { ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C) +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C) define amdgpu_kernel void @wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { - %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C) + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C) store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) define amdgpu_kernel void @wmma_f32_16x16x16_ibf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { - %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) define amdgpu_kernel void @wmma_f16_16x16x16_f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) + %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) store <16 x half> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) define amdgpu_kernel void @wmma_f16_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) + %tmp0 = call <16 x i16> 
@llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) store <16 x i16> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) define amdgpu_kernel void @wmma_i32_16x16x16_ui8(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) + %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) define amdgpu_kernel void @wmma_i32_16x16x16_ui4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) + %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 ret void } +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 
x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) +define amdgpu_kernel void @swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) + store <8 x i16> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 
x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x 
float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + ; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep) define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { bb: @@ -190,12 +278,23 @@ declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1 -declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half>, <16 x half> , <8 x float>) #1 -declare <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16>, <16 x i16> , <8 x float>) #1 -declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1 -declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1 -declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1 -declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1 +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1 +declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1 +declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1 +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16, i1) +declare <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll index b0507e9d075fab..9687ba683fb7e6 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll @@ -35,16 +35,24 @@ define i8 @load_atomic_i8_aligned_monotonic_const(ptr readonly %ptr) { } define i8 @load_atomic_i8_aligned_acquire(ptr %ptr) { -; CHECK-LABEL: load_atomic_i8_aligned_acquire: -; CHECK: ldapurb w0, [x0, #4] +; GISEL-LABEL: load_atomic_i8_aligned_acquire: +; GISEL: add x8, x0, #4 +; GISEL: ldaprb w0, [x8] +; +; SDAG-LABEL: load_atomic_i8_aligned_acquire: +; SDAG: ldapurb w0, [x0, #4] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r } define i8 @load_atomic_i8_aligned_acquire_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_i8_aligned_acquire_const: -; CHECK: ldapurb w0, [x0, #4] +; GISEL-LABEL: load_atomic_i8_aligned_acquire_const: +; GISEL: add x8, x0, #4 +; GISEL: ldaprb w0, [x8] +; +; SDAG-LABEL: load_atomic_i8_aligned_acquire_const: +; SDAG: ldapurb w0, [x0, #4] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -101,16 +109,24 @@ define i16 
@load_atomic_i16_aligned_monotonic_const(ptr readonly %ptr) { } define i16 @load_atomic_i16_aligned_acquire(ptr %ptr) { -; CHECK-LABEL: load_atomic_i16_aligned_acquire: -; CHECK: ldapurh w0, [x0, #8] +; GISEL-LABEL: load_atomic_i16_aligned_acquire: +; GISEL: add x8, x0, #8 +; GISEL: ldaprh w0, [x8] +; +; SDAG-LABEL: load_atomic_i16_aligned_acquire: +; SDAG: ldapurh w0, [x0, #8] %gep = getelementptr inbounds i16, ptr %ptr, i32 4 %r = load atomic i16, ptr %gep acquire, align 2 ret i16 %r } define i16 @load_atomic_i16_aligned_acquire_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_i16_aligned_acquire_const: -; CHECK: ldapurh w0, [x0, #8] +; GISEL-LABEL: load_atomic_i16_aligned_acquire_const: +; GISEL: add x8, x0, #8 +; GISEL: ldaprh w0, [x8] +; +; SDAG-LABEL: load_atomic_i16_aligned_acquire_const: +; SDAG: ldapurh w0, [x0, #8] %gep = getelementptr inbounds i16, ptr %ptr, i32 4 %r = load atomic i16, ptr %gep acquire, align 2 ret i16 %r @@ -367,16 +383,24 @@ define i8 @load_atomic_i8_unaligned_monotonic_const(ptr readonly %ptr) { } define i8 @load_atomic_i8_unaligned_acquire(ptr %ptr) { -; CHECK-LABEL: load_atomic_i8_unaligned_acquire: -; CHECK: ldapurb w0, [x0, #4] +; GISEL-LABEL: load_atomic_i8_unaligned_acquire: +; GISEL: add x8, x0, #4 +; GISEL: ldaprb w0, [x8] +; +; SDAG-LABEL: load_atomic_i8_unaligned_acquire: +; SDAG: ldapurb w0, [x0, #4] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r } define i8 @load_atomic_i8_unaligned_acquire_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_i8_unaligned_acquire_const: -; CHECK: ldapurb w0, [x0, #4] +; GISEL-LABEL: load_atomic_i8_unaligned_acquire_const: +; GISEL: add x8, x0, #4 +; GISEL: ldaprb w0, [x8] +; +; SDAG-LABEL: load_atomic_i8_unaligned_acquire_const: +; SDAG: ldapurb w0, [x0, #4] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -819,7 +843,8 @@ define i128 
@load_atomic_i128_unaligned_seq_cst_const(ptr readonly %ptr) { define i8 @load_atomic_i8_from_gep() { ; GISEL-LABEL: load_atomic_i8_from_gep: ; GISEL: bl init -; GISEL: ldapurb w0, [x8, #1] +; GISEL: add x8, x8, #1 +; GISEL: ldaprb w0, [x8] ; ; SDAG-LABEL: load_atomic_i8_from_gep: ; SDAG: bl init @@ -834,7 +859,8 @@ define i8 @load_atomic_i8_from_gep() { define i16 @load_atomic_i16_from_gep() { ; GISEL-LABEL: load_atomic_i16_from_gep: ; GISEL: bl init -; GISEL: ldapurh w0, [x8, #2] +; GISEL: add x8, x8, #2 +; GISEL: ldaprh w0, [x8] ; ; SDAG-LABEL: load_atomic_i16_from_gep: ; SDAG: bl init @@ -884,7 +910,6 @@ define i128 @load_atomic_i128_from_gep() { ; ; SDAG-LABEL: load_atomic_i128_from_gep: ; SDAG: bl init -; SDAG: ldp x0, x1, [sp, #16] ; SDAG: dmb ishld %a = alloca [3 x i128] call void @init(ptr %a) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll index 0e9c126e97a3d8..6152baf2e40f17 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll @@ -993,24 +993,24 @@ define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) #0 { ; CHECK-NOLSE-O1: ; %bb.0: ; CHECK-NOLSE-O1-NEXT: ldrb w8, [x0, #4095] ; CHECK-NOLSE-O1-NEXT: ldrb w9, [x0, w1, sxtw] -; CHECK-NOLSE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936 ; CHECK-NOLSE-O1-NEXT: ldurb w10, [x0, #-256] -; CHECK-NOLSE-O1-NEXT: add w8, w8, w9 -; CHECK-NOLSE-O1-NEXT: ldrb w9, [x11] -; CHECK-NOLSE-O1-NEXT: add w8, w8, w10 -; CHECK-NOLSE-O1-NEXT: add w0, w8, w9 +; CHECK-NOLSE-O1-NEXT: add w8, w9, w8, uxtb +; CHECK-NOLSE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 +; CHECK-NOLSE-O1-NEXT: ldrb w9, [x9] +; CHECK-NOLSE-O1-NEXT: add w8, w8, w10, uxtb +; CHECK-NOLSE-O1-NEXT: add w0, w8, w9, uxtb ; CHECK-NOLSE-O1-NEXT: ret ; ; CHECK-OUTLINE-O1-LABEL: atomic_load_relaxed_8: ; CHECK-OUTLINE-O1: ; %bb.0: ; CHECK-OUTLINE-O1-NEXT: ldrb w8, [x0, #4095] ; CHECK-OUTLINE-O1-NEXT: ldrb w9, [x0, w1, sxtw] -; 
CHECK-OUTLINE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936 ; CHECK-OUTLINE-O1-NEXT: ldurb w10, [x0, #-256] -; CHECK-OUTLINE-O1-NEXT: add w8, w8, w9 -; CHECK-OUTLINE-O1-NEXT: ldrb w9, [x11] -; CHECK-OUTLINE-O1-NEXT: add w8, w8, w10 -; CHECK-OUTLINE-O1-NEXT: add w0, w8, w9 +; CHECK-OUTLINE-O1-NEXT: add w8, w9, w8, uxtb +; CHECK-OUTLINE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O1-NEXT: ldrb w9, [x9] +; CHECK-OUTLINE-O1-NEXT: add w8, w8, w10, uxtb +; CHECK-OUTLINE-O1-NEXT: add w0, w8, w9, uxtb ; CHECK-OUTLINE-O1-NEXT: ret ; ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_8: @@ -1045,12 +1045,12 @@ define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) #0 { ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldrb w8, [x0, #4095] ; CHECK-LSE-O1-NEXT: ldrb w9, [x0, w1, sxtw] -; CHECK-LSE-O1-NEXT: ldurb w10, [x0, #-256] -; CHECK-LSE-O1-NEXT: add w8, w8, w10 -; CHECK-LSE-O1-NEXT: add w8, w8, w9 +; CHECK-LSE-O1-NEXT: add w8, w9, w8, uxtb +; CHECK-LSE-O1-NEXT: ldurb w9, [x0, #-256] +; CHECK-LSE-O1-NEXT: add w8, w8, w9, uxtb ; CHECK-LSE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 ; CHECK-LSE-O1-NEXT: ldrb w9, [x9] -; CHECK-LSE-O1-NEXT: add w0, w8, w9 +; CHECK-LSE-O1-NEXT: add w0, w8, w9, uxtb ; CHECK-LSE-O1-NEXT: ret ; ; CHECK-LSE-O0-LABEL: atomic_load_relaxed_8: @@ -1089,24 +1089,24 @@ define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) #0 { ; CHECK-NOLSE-O1: ; %bb.0: ; CHECK-NOLSE-O1-NEXT: ldrh w8, [x0, #8190] ; CHECK-NOLSE-O1-NEXT: ldrh w9, [x0, w1, sxtw #1] -; CHECK-NOLSE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936 ; CHECK-NOLSE-O1-NEXT: ldurh w10, [x0, #-256] -; CHECK-NOLSE-O1-NEXT: add w8, w8, w9 -; CHECK-NOLSE-O1-NEXT: ldrh w9, [x11] -; CHECK-NOLSE-O1-NEXT: add w8, w8, w10 -; CHECK-NOLSE-O1-NEXT: add w0, w8, w9 +; CHECK-NOLSE-O1-NEXT: add w8, w9, w8, uxth +; CHECK-NOLSE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 +; CHECK-NOLSE-O1-NEXT: ldrh w9, [x9] +; CHECK-NOLSE-O1-NEXT: add w8, w8, w10, uxth +; CHECK-NOLSE-O1-NEXT: add w0, w8, w9, uxth ; 
CHECK-NOLSE-O1-NEXT: ret ; ; CHECK-OUTLINE-O1-LABEL: atomic_load_relaxed_16: ; CHECK-OUTLINE-O1: ; %bb.0: ; CHECK-OUTLINE-O1-NEXT: ldrh w8, [x0, #8190] ; CHECK-OUTLINE-O1-NEXT: ldrh w9, [x0, w1, sxtw #1] -; CHECK-OUTLINE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936 ; CHECK-OUTLINE-O1-NEXT: ldurh w10, [x0, #-256] -; CHECK-OUTLINE-O1-NEXT: add w8, w8, w9 -; CHECK-OUTLINE-O1-NEXT: ldrh w9, [x11] -; CHECK-OUTLINE-O1-NEXT: add w8, w8, w10 -; CHECK-OUTLINE-O1-NEXT: add w0, w8, w9 +; CHECK-OUTLINE-O1-NEXT: add w8, w9, w8, uxth +; CHECK-OUTLINE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 +; CHECK-OUTLINE-O1-NEXT: ldrh w9, [x9] +; CHECK-OUTLINE-O1-NEXT: add w8, w8, w10, uxth +; CHECK-OUTLINE-O1-NEXT: add w0, w8, w9, uxth ; CHECK-OUTLINE-O1-NEXT: ret ; ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_16: @@ -1141,12 +1141,12 @@ define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) #0 { ; CHECK-LSE-O1: ; %bb.0: ; CHECK-LSE-O1-NEXT: ldrh w8, [x0, #8190] ; CHECK-LSE-O1-NEXT: ldrh w9, [x0, w1, sxtw #1] -; CHECK-LSE-O1-NEXT: ldurh w10, [x0, #-256] -; CHECK-LSE-O1-NEXT: add w8, w8, w10 -; CHECK-LSE-O1-NEXT: add w8, w8, w9 +; CHECK-LSE-O1-NEXT: add w8, w9, w8, uxth +; CHECK-LSE-O1-NEXT: ldurh w9, [x0, #-256] +; CHECK-LSE-O1-NEXT: add w8, w8, w9, uxth ; CHECK-LSE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 ; CHECK-LSE-O1-NEXT: ldrh w9, [x9] -; CHECK-LSE-O1-NEXT: add w0, w8, w9 +; CHECK-LSE-O1-NEXT: add w0, w8, w9, uxth ; CHECK-LSE-O1-NEXT: ret ; ; CHECK-LSE-O0-LABEL: atomic_load_relaxed_16: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll index 5a7bd6ee20f9b4..22e283b0e0994a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll @@ -385,13 +385,13 @@ define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDRBBui renamable $x0, 4095, pcsections 
!0 :: (load monotonic (s8) from %ir.ptr_unsigned) - ; CHECK-NEXT: renamable $w9 = LDRBBroW renamable $x0, killed renamable $w1, 1, 0, pcsections !0 :: (load unordered (s8) from %ir.ptr_regoff) - ; CHECK-NEXT: renamable $w10 = LDURBBi renamable $x0, -256, pcsections !0 :: (load monotonic (s8) from %ir.ptr_unscaled) - ; CHECK-NEXT: renamable $x11 = ADDXri killed renamable $x0, 291, 12 - ; CHECK-NEXT: $w8 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0 - ; CHECK-NEXT: renamable $w9 = LDRBBui killed renamable $x11, 0, pcsections !0 :: (load unordered (s8) from %ir.ptr_random) - ; CHECK-NEXT: $w8 = ADDWrs killed renamable $w8, killed renamable $w10, 0, pcsections !0 - ; CHECK-NEXT: $w0 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0 + ; CHECK-NEXT: renamable $w9 = LDRBBroW renamable $x0, killed renamable $w1, 1, 0 :: (load unordered (s8) from %ir.ptr_regoff) + ; CHECK-NEXT: renamable $w10 = LDURBBi renamable $x0, -256 :: (load monotonic (s8) from %ir.ptr_unscaled) + ; CHECK-NEXT: renamable $w8 = ADDWrx killed renamable $w9, killed renamable $w8, 0, pcsections !0 + ; CHECK-NEXT: renamable $x9 = ADDXri killed renamable $x0, 291, 12 + ; CHECK-NEXT: renamable $w8 = ADDWrx killed renamable $w8, killed renamable $w10, 0, pcsections !0 + ; CHECK-NEXT: renamable $w9 = LDRBBui killed renamable $x9, 0, pcsections !0 :: (load unordered (s8) from %ir.ptr_random) + ; CHECK-NEXT: renamable $w0 = ADDWrx killed renamable $w8, killed renamable $w9, 0, pcsections !0 ; CHECK-NEXT: RET undef $lr, implicit $w0 %ptr_unsigned = getelementptr i8, ptr %p, i32 4095 %val_unsigned = load atomic i8, ptr %ptr_unsigned monotonic, align 1, !pcsections !0 @@ -417,13 +417,13 @@ define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDRHHui renamable $x0, 4095, pcsections !0 :: (load monotonic (s16) from %ir.ptr_unsigned) - ; CHECK-NEXT: renamable $w9 = LDRHHroW renamable $x0, 
killed renamable $w1, 1, 1, pcsections !0 :: (load unordered (s16) from %ir.ptr_regoff) - ; CHECK-NEXT: renamable $w10 = LDURHHi renamable $x0, -256, pcsections !0 :: (load monotonic (s16) from %ir.ptr_unscaled) - ; CHECK-NEXT: renamable $x11 = ADDXri killed renamable $x0, 291, 12 - ; CHECK-NEXT: $w8 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0 - ; CHECK-NEXT: renamable $w9 = LDRHHui killed renamable $x11, 0, pcsections !0 :: (load unordered (s16) from %ir.ptr_random) - ; CHECK-NEXT: $w8 = ADDWrs killed renamable $w8, killed renamable $w10, 0, pcsections !0 - ; CHECK-NEXT: $w0 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0 + ; CHECK-NEXT: renamable $w9 = LDRHHroW renamable $x0, killed renamable $w1, 1, 1 :: (load unordered (s16) from %ir.ptr_regoff) + ; CHECK-NEXT: renamable $w10 = LDURHHi renamable $x0, -256 :: (load monotonic (s16) from %ir.ptr_unscaled) + ; CHECK-NEXT: renamable $w8 = ADDWrx killed renamable $w9, killed renamable $w8, 8, pcsections !0 + ; CHECK-NEXT: renamable $x9 = ADDXri killed renamable $x0, 291, 12 + ; CHECK-NEXT: renamable $w8 = ADDWrx killed renamable $w8, killed renamable $w10, 8, pcsections !0 + ; CHECK-NEXT: renamable $w9 = LDRHHui killed renamable $x9, 0, pcsections !0 :: (load unordered (s16) from %ir.ptr_random) + ; CHECK-NEXT: renamable $w0 = ADDWrx killed renamable $w8, killed renamable $w9, 8, pcsections !0 ; CHECK-NEXT: RET undef $lr, implicit $w0 %ptr_unsigned = getelementptr i16, ptr %p, i32 4095 %val_unsigned = load atomic i16, ptr %ptr_unsigned monotonic, align 2, !pcsections !0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/atomic-anyextending-load-crash.ll b/llvm/test/CodeGen/AArch64/GlobalISel/atomic-anyextending-load-crash.ll new file mode 100644 index 00000000000000..4bb4e4882410dc --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/atomic-anyextending-load-crash.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 4 +; RUN: llc -global-isel -global-isel-abort=1 -O0 -o - %s | FileCheck %s +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64e-apple-macosx14.0.0" + +define void @test(ptr %0) { +; CHECK-LABEL: test: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: sub sp, sp, #144 +; CHECK-NEXT: stp x29, x30, [sp, #128] ; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 144 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ldar w8, [x0] +; CHECK-NEXT: str w8, [sp, #116] ; 4-byte Folded Spill +; CHECK-NEXT: mov x8, #0 ; =0x0 +; CHECK-NEXT: str x8, [sp, #120] ; 8-byte Folded Spill +; CHECK-NEXT: blr x8 +; CHECK-NEXT: ldr w11, [sp, #116] ; 4-byte Folded Reload +; CHECK-NEXT: ldr x8, [sp, #120] ; 8-byte Folded Reload +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: str xzr, [x9, #8] +; CHECK-NEXT: str xzr, [x9, #16] +; CHECK-NEXT: str xzr, [x9, #24] +; CHECK-NEXT: str xzr, [x9, #32] +; CHECK-NEXT: str xzr, [x9, #40] +; CHECK-NEXT: ; implicit-def: $x10 +; CHECK-NEXT: mov x10, x11 +; CHECK-NEXT: str x10, [x9, #48] +; CHECK-NEXT: str xzr, [x9, #56] +; CHECK-NEXT: str xzr, [x9, #64] +; CHECK-NEXT: str xzr, [x9, #72] +; CHECK-NEXT: str xzr, [x9, #80] +; CHECK-NEXT: str xzr, [x9, #88] +; CHECK-NEXT: str xzr, [x9, #96] +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: blr x8 +; CHECK-NEXT: ldp x29, x30, [sp, #128] ; 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #144 +; CHECK-NEXT: ret +entry: + %atomic-load = load atomic i32, ptr %0 seq_cst, align 4 + %call10 = call ptr null() + call void (ptr, ...) 
null(ptr null, ptr null, i32 0, ptr null, ptr null, i32 0, i32 0, i32 %atomic-load, i32 0, i32 0, i32 0, i32 0, i64 0, ptr null) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir index 16b780a8397347..661265173ae82b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir @@ -529,3 +529,27 @@ body: | RET_ReallyLR implicit $q0 ... + +--- +name: pr81244 +tracksRegLiveness: true +body: | + bb.0: + liveins: $d0 + ; CHECK-LABEL: name: pr81244 + ; CHECK: liveins: $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[COPY]](<2 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<2 x s8>), [[TRUNC]](<2 x s8>) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[CONCAT_VECTORS]](<4 x s8>) + ; CHECK-NEXT: $d0 = COPY [[ANYEXT]](<4 x s16>) + ; CHECK-NEXT: RET_ReallyLR implicit $d0 + %0:_(<2 x s32>) = COPY $d0 + %1:_(<2 x s8>) = G_TRUNC %0(<2 x s32>) + %2:_(<4 x s8>) = G_CONCAT_VECTORS %1(<2 x s8>), %1(<2 x s8>) + %3:_(<4 x s16>) = G_ANYEXT %2(<4 x s8>) + $d0 = COPY %3(<4 x s16>) + RET_ReallyLR implicit $d0 + +... 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll index 07744dada4f1fa..b1166e683ec74e 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll @@ -323,3 +323,22 @@ define i32 @test_alias_3xs16(ptr %ptr, ptr %ptr2, ptr %ptr3, ptr noalias %safe_p store i32 14, ptr %addr4 ret i32 %safeld } + +@G = external global [10 x i32] + +define void @invalid_zero_offset_no_merge(i64 %0) { +; CHECK-LABEL: invalid_zero_offset_no_merge: +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x8, _G@GOTPAGE +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: ldr x8, [x8, _G@GOTPAGEOFF] +; CHECK-NEXT: str wzr, [x8, x0, lsl #2] +; CHECK-NEXT: str wzr, [x8, #4] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdrGot Lloh0, Lloh1 + %2 = getelementptr [10 x i32], ptr @G, i64 0, i64 %0 + store i32 0, ptr %2, align 4 + store i32 0, ptr getelementptr inbounds ([10 x i32], ptr @G, i64 0, i64 1), align 4 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.mir b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.mir index e98e1ce599f2f6..69457a8cc0c19f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.mir @@ -162,6 +162,13 @@ ret void } + @G = external global [10 x i32] + define void @invalid_zero_offset_no_merge(i64 %0) { + %2 = getelementptr [10 x i32], ptr @G, i64 0, i64 %0 + store i32 0, ptr %2, align 4 + store i32 0, ptr getelementptr inbounds ([10 x i32], ptr @G, i64 0, i64 1), align 4 + ret void + } ... --- name: test_simple_2xs8 @@ -582,13 +589,11 @@ liveins: frameInfo: maxAlignment: 1 machineFunctionInfo: {} +# The store to ptr2 prevents merging into a single store. +# We can still merge the stores into addr1 and addr2. body: | bb.1 (%ir-block.0): liveins: $x0, $x1 - - ; The store to ptr2 prevents merging into a single store. 
- ; We can still merge the stores into addr1 and addr2. - ; CHECK-LABEL: name: test_alias_4xs16 ; CHECK: liveins: $x0, $x1 ; CHECK-NEXT: {{ $}} @@ -639,10 +644,10 @@ liveins: frameInfo: maxAlignment: 1 machineFunctionInfo: {} +# Here store of 5 and 9 can be merged, others have aliasing barriers. body: | bb.1 (%ir-block.0): liveins: $x0, $x1, $x2 - ; Here store of 5 and 9 can be merged, others have aliasing barriers. ; CHECK-LABEL: name: test_alias2_4xs16 ; CHECK: liveins: $x0, $x1, $x2 ; CHECK-NEXT: {{ $}} @@ -698,12 +703,11 @@ liveins: frameInfo: maxAlignment: 1 machineFunctionInfo: {} +# No merging can be done here. body: | bb.1 (%ir-block.0): liveins: $x0, $x1, $x2, $x3 - ; No merging can be done here. - ; CHECK-LABEL: name: test_alias3_4xs16 ; CHECK: liveins: $x0, $x1, $x2, $x3 ; CHECK-NEXT: {{ $}} @@ -767,12 +771,10 @@ stack: - { id: 0, name: a1, size: 24, alignment: 4 } - { id: 1, name: a2, size: 4, alignment: 4 } machineFunctionInfo: {} +# Can merge because the load is from a different alloca and can't alias. body: | bb.1 (%ir-block.0): liveins: $x0 - - ; Can merge because the load is from a different alloca and can't alias. - ; CHECK-LABEL: name: test_alias_allocas_2xs32 ; CHECK: liveins: $x0 ; CHECK-NEXT: {{ $}} @@ -826,3 +828,43 @@ body: | RET_ReallyLR ... 
+--- +name: invalid_zero_offset_no_merge +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$x0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.1 (%ir-block.1): + liveins: $x0 + + ; CHECK-LABEL: name: invalid_zero_offset_no_merge + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64) + ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @G + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[GV]], [[SHL]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: G_STORE [[C1]](s32), [[PTR_ADD]](p0) :: (store (s32) into %ir.2) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw G_PTR_ADD [[GV]], [[C2]](s64) + ; CHECK-NEXT: G_STORE [[C1]](s32), [[PTR_ADD1]](p0) :: (store (s32) into `ptr getelementptr inbounds ([10 x i32], ptr @G, i64 0, i64 1)`) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s64) = COPY $x0 + %9:_(s64) = G_CONSTANT i64 2 + %3:_(s64) = G_SHL %0, %9(s64) + %1:_(p0) = G_GLOBAL_VALUE @G + %4:_(p0) = G_PTR_ADD %1, %3(s64) + %6:_(s32) = G_CONSTANT i32 0 + G_STORE %6(s32), %4(p0) :: (store (s32) into %ir.2) + %8:_(s64) = G_CONSTANT i64 4 + %7:_(p0) = nuw G_PTR_ADD %1, %8(s64) + G_STORE %6(s32), %7(p0) :: (store (s32) into `ptr getelementptr inbounds ([10 x i32], ptr @G, i64 0, i64 1)`) + RET_ReallyLR + +... 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-za-clobber.ll b/llvm/test/CodeGen/AArch64/aarch64-za-clobber.ll new file mode 100644 index 00000000000000..a8cba7dc9a91e9 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-za-clobber.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-none-linux-gnu -stop-after=aarch64-isel < %s -o - | FileCheck %s + +define void @alpha( %x) local_unnamed_addr { +entry: +; CHECK: INLINEASM &"movt zt0[3, mul vl], z0", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $za + tail call void asm sideeffect "movt zt0[3, mul vl], z0", "~{za}"() + ret void +} + +define void @beta( %x) local_unnamed_addr { +entry: +; CHECK: INLINEASM &"movt zt0[3, mul vl], z0", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $zt0 + tail call void asm sideeffect "movt zt0[3, mul vl], z0", "~{zt0}"() + ret void +} diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll index 2181eaaee7db68..d39029163a47aa 100644 --- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll +++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll @@ -214,8 +214,9 @@ define void @t17(i64 %a) { define i8 @LdOffset_i8(ptr %a) { ; CHECK-LABEL: LdOffset_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 -; CHECK-NEXT: ldrb w0, [x8, #3704] +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrb w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -226,8 +227,9 @@ define i8 @LdOffset_i8(ptr %a) { define i32 @LdOffset_i8_zext32(ptr %a) { ; CHECK-LABEL: LdOffset_i8_zext32: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 -; CHECK-NEXT: ldrb w0, [x8, #3704] +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrb w0, [x0, x8] 
; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -253,8 +255,9 @@ define i32 @LdOffset_i8_sext32(ptr %a) { define i64 @LdOffset_i8_zext64(ptr %a) { ; CHECK-LABEL: LdOffset_i8_zext64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 -; CHECK-NEXT: ldrb w0, [x8, #3704] +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrb w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll index fe4da2e7cf36b5..89c8d540b97e04 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -1848,3 +1848,51 @@ define <2 x i128> @uabd_i64(<2 x i64> %a, <2 x i64> %b) { %absel = select <2 x i1> %abcmp, <2 x i128> %ababs, <2 x i128> %abdiff ret <2 x i128> %absel } + +define <8 x i16> @pr88784(<8 x i8> %l0, <8 x i8> %l1, <8 x i16> %l2) { +; CHECK-SD-LABEL: pr88784: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: usubl.8h v0, v0, v1 +; CHECK-SD-NEXT: cmlt.8h v1, v2, #0 +; CHECK-SD-NEXT: ssra.8h v0, v2, #15 +; CHECK-SD-NEXT: eor.16b v0, v1, v0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: pr88784: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: usubl.8h v0, v0, v1 +; CHECK-GI-NEXT: sshr.8h v1, v2, #15 +; CHECK-GI-NEXT: ssra.8h v0, v2, #15 +; CHECK-GI-NEXT: eor.16b v0, v1, v0 +; CHECK-GI-NEXT: ret + %l4 = zext <8 x i8> %l0 to <8 x i16> + %l5 = ashr <8 x i16> %l2, + %l6 = zext <8 x i8> %l1 to <8 x i16> + %l7 = sub <8 x i16> %l4, %l6 + %l8 = add <8 x i16> %l5, %l7 + %l9 = xor <8 x i16> %l5, %l8 + ret <8 x i16> %l9 +} + +define <8 x i16> @pr88784_fixed(<8 x i8> %l0, <8 x i8> %l1, <8 x i16> %l2) { +; CHECK-SD-LABEL: pr88784_fixed: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: uabdl.8h v0, v0, v1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: pr88784_fixed: +; CHECK-GI: // %bb.0: 
+; CHECK-GI-NEXT: usubl.8h v0, v0, v1 +; CHECK-GI-NEXT: sshr.8h v1, v0, #15 +; CHECK-GI-NEXT: ssra.8h v0, v0, #15 +; CHECK-GI-NEXT: eor.16b v0, v1, v0 +; CHECK-GI-NEXT: ret + %l4 = zext <8 x i8> %l0 to <8 x i16> + %l6 = zext <8 x i8> %l1 to <8 x i16> + %l7 = sub <8 x i16> %l4, %l6 + %l5 = ashr <8 x i16> %l7, + %l8 = add <8 x i16> %l5, %l7 + %l9 = xor <8 x i16> %l5, %l8 + ret <8 x i16> %l9 +} + diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll index 5c56f51e1ca554..bb9ba05f7a2724 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll @@ -147,8 +147,8 @@ define void @has_varargs(...) nounwind { ; CHECK-NEXT: add x29, sp, #160 ; CHECK-NEXT: .seh_add_fp 160 ; CHECK-NEXT: .seh_endprologue -; CHECK-NEXT: ldp x8, x5, [x4, #32] -; CHECK-NEXT: mov x4, x8 +; CHECK-NEXT: add x4, x4, #32 +; CHECK-NEXT: mov x5, xzr ; CHECK-NEXT: blr x9 ; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_ret ; CHECK-NEXT: ldr x0, [x8, :lo12:__os_arm64x_dispatch_ret] diff --git a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll index dc16b3a1a0f270..844fc52ddade63 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll @@ -100,5 +100,42 @@ define void @varargs_many_argscalleer() nounwind { ret void } +define void @varargs_caller_tail() nounwind { +; CHECK-LABEL: varargs_caller_tail: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: mov x4, sp +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x9, #4617315517961601024 // =0x4014000000000000 +; CHECK-NEXT: mov x0, #4607182418800017408 // =0x3ff0000000000000 +; CHECK-NEXT: mov w1, #2 // =0x2 +; CHECK-NEXT: mov x2, #4613937818241073152 // =0x4008000000000000 +; CHECK-NEXT: mov w3, #4 // =0x4 +; CHECK-NEXT: mov w5, #16 // =0x10 +; CHECK-NEXT: stp xzr, x30, [sp, #24] // 8-byte Folded Spill +; CHECK-NEXT: stp x9, x8, [sp] +; 
CHECK-NEXT: str xzr, [sp, #16] +; CHECK-NEXT: .weak_anti_dep varargs_callee +; CHECK-NEXT:.set varargs_callee, "#varargs_callee"@WEAKREF +; CHECK-NEXT: .weak_anti_dep "#varargs_callee" +; CHECK-NEXT:.set "#varargs_callee", varargs_callee@WEAKREF +; CHECK-NEXT: bl "#varargs_callee" +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: add x4, sp, #48 +; CHECK-NEXT: mov x0, #4607182418800017408 // =0x3ff0000000000000 +; CHECK-NEXT: mov w1, #4 // =0x4 +; CHECK-NEXT: mov w2, #3 // =0x3 +; CHECK-NEXT: mov w3, #2 // =0x2 +; CHECK-NEXT: mov x5, xzr +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: .weak_anti_dep varargs_callee +; CHECK-NEXT:.set varargs_callee, "#varargs_callee"@WEAKREF +; CHECK-NEXT: .weak_anti_dep "#varargs_callee" +; CHECK-NEXT:.set "#varargs_callee", varargs_callee@WEAKREF +; CHECK-NEXT: b "#varargs_callee" + call void (double, ...) @varargs_callee(double 1.0, i32 2, double 3.0, i32 4, double 5.0, <2 x double> ) + tail call void (double, ...) @varargs_callee(double 1.0, i32 4, i32 3, i32 2) + ret void +} declare void @llvm.va_start(ptr) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll index 93497f38063d28..7b8448de2331b4 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s ; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16,+sve -o - | FileCheck %s +; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16,+sve2 -o - | FileCheck %s target triple = "aarch64" @@ -158,6 +159,32 @@ entry: ret <16 x half> %interleaved.vec } + +; Expected not to transform as it is integer +define <16 x i16> @complex_add_v16i16(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: complex_add_v16i16: +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: uzp1 v4.8h, v2.8h, v3.8h +; CHECK-NEXT: uzp1 v5.8h, v0.8h, v1.8h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: uzp2 v1.8h, v2.8h, v3.8h +; CHECK-NEXT: sub v2.8h, v4.8h, v0.8h +; CHECK-NEXT: add v1.8h, v1.8h, v5.8h +; CHECK-NEXT: zip1 v0.8h, v2.8h, v1.8h +; CHECK-NEXT: zip2 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: ret +entry: + %a.real = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <8 x i32> + %a.imag = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <8 x i32> + %b.real = shufflevector <16 x i16> %b, <16 x i16> zeroinitializer, <8 x i32> + %b.imag = shufflevector <16 x i16> %b, <16 x i16> zeroinitializer, <8 x i32> + %0 = sub <8 x i16> %b.real, %a.imag + %1 = add <8 x i16> %b.imag, %a.real + %interleaved.vec = shufflevector <8 x i16> %0, <8 x i16> %1, <16 x i32> + ret <16 x i16> %interleaved.vec +} + + declare { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>) declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>) diff --git a/llvm/test/CodeGen/AArch64/cpus.ll b/llvm/test/CodeGen/AArch64/cpus.ll index b24866064efaea..7b45d0f30bcdd4 100644 --- a/llvm/test/CodeGen/AArch64/cpus.ll +++ b/llvm/test/CodeGen/AArch64/cpus.ll @@ -37,6 +37,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=a64fx 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1a 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1b 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID ; CHECK-NOT: {{.*}} is not a recognized processor for this target diff --git a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir index 15b6700398ea08..488f1ffdb52f3b 100755 --- a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir +++ 
b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir @@ -14,8 +14,9 @@ body: | ; CHECK-LABEL: name: LdOffset ; CHECK: liveins: $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x8 = ADDXri $x0, 253, 12 - ; CHECK-NEXT: renamable $w0 = LDRBBui killed renamable $x8, 3704 + ; CHECK-NEXT: renamable $w8 = MOVZWi 56952, 0 + ; CHECK-NEXT: renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8 + ; CHECK-NEXT: renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0 ; CHECK-NEXT: RET undef $lr, implicit $w0 renamable $w8 = MOVZWi 56952, 0 renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8 diff --git a/llvm/test/CodeGen/AArch64/neon-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-dot-product.ll index 23d1e43a5cab1b..cf09a46000dab9 100644 --- a/llvm/test/CodeGen/AArch64/neon-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-dot-product.ll @@ -7,6 +7,7 @@ ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n2 < %s | FileCheck %s ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1 < %s | FileCheck %s ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1a < %s | FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1b < %s | FileCheck %s declare <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) declare <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) diff --git a/llvm/test/CodeGen/AArch64/remat.ll b/llvm/test/CodeGen/AArch64/remat.ll index 483c4d71ee21fb..704c87feb6a9b8 100644 --- a/llvm/test/CodeGen/AArch64/remat.ll +++ b/llvm/test/CodeGen/AArch64/remat.ll @@ -26,6 +26,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=thunderx3t110 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1a -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1b -o - %s | FileCheck %s %X = type { i64, i64, i64 } declare void @f(ptr) diff --git 
a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index e18e18a1cfad18..381091b4539433 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -23,9 +23,9 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline ; CHECK-FISEL-NEXT: bl streaming_callee ; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-FISEL-NEXT: smstop sm +; CHECK-FISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: adrp x8, .LCPI0_0 ; CHECK-FISEL-NEXT: ldr d0, [x8, :lo12:.LCPI0_0] -; CHECK-FISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: fadd d0, d1, d0 ; CHECK-FISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload @@ -49,9 +49,9 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline ; CHECK-GISEL-NEXT: bl streaming_callee ; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-GISEL-NEXT: smstop sm +; CHECK-GISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-GISEL-NEXT: fmov d0, x8 -; CHECK-GISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: fadd d0, d1, d0 ; CHECK-GISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload @@ -82,9 +82,9 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: bl normal_callee ; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-COMMON-NEXT: fmov d0, x8 -; CHECK-COMMON-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: fadd d0, d1, d0 
; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload @@ -110,14 +110,16 @@ define double @locally_streaming_caller_normal_callee(double %x) nounwind noinli ; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm ; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl normal_callee ; CHECK-COMMON-NEXT: str d0, [sp, #16] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: ldr d1, [sp, #16] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-COMMON-NEXT: fmov d0, x8 -; CHECK-COMMON-NEXT: ldr d1, [sp, #16] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: fadd d0, d1, d0 ; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm @@ -329,9 +331,9 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-COMMON-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm -; CHECK-COMMON-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-COMMON-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm @@ -390,9 +392,9 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; 
CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: stp s1, s0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm -; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp s1, s0, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl fmodf ; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm @@ -420,7 +422,9 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati ; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: bl __arm_sme_state +; CHECK-COMMON-NEXT: ldp s2, s0, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: and x19, x0, #0x1 +; CHECK-COMMON-NEXT: stp s2, s0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: tbz w19, #0, .LBB12_2 ; CHECK-COMMON-NEXT: // %bb.1: ; CHECK-COMMON-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll new file mode 100644 index 00000000000000..d5bea725b6d14d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -0,0 +1,1640 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-unknown-unknown-eabi-elf" + +; This test verifies that call arguments and results are not coalesced +; with SVE vector registers by the coalescer, such that no 'mul vl' +; ldr/str pairs are generated in the streaming-mode-changing call +; sequence. 
+ +; +; Scalar arguments +; + +define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl use_i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i8 %arg, i32 0 + call void @use_i8(i8 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl use_i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i16 %arg, i32 0 + call void @use_i16(i16 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl use_i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i32 %arg, i32 0 + call void @use_i32(i32 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl use_i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i64 %arg, i32 0 + call void @use_i64(i64 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 +; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: bl use_f16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, half %arg, i32 0 + call void @use_f16(half %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: bl use_f32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, float %arg, i32 0 + call void @use_f32(float %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_f64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, double %arg, i32 0 + call void @use_f64(double %arg) + store %vec, ptr %ptr + ret void +} + + +; +; Single-element vector arguments +; + +define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v16i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i8> %arg, i32 0 + %vec = insertelement poison, i8 %elt, i32 0 + call void @use_v16i8(<1 x i8> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v8i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i16> %arg, i32 0 + %vec = insertelement poison, i16 %elt, i32 0 + call void @use_v8i16(<1 x i16> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v4i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i32> %arg, i32 0 + %vec = insertelement poison, i32 %elt, i32 0 + call void @use_v4i32(<1 x i32> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v2i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i64> %arg, i32 0 + %vec = insertelement poison, i64 %elt, i32 0 + call void @use_v2i64(<1 x i64> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1f16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 +; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: bl use_v8f16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x half> %arg, i32 0 + %vec = insertelement poison, half %elt, i32 0 + call void @use_v8f16(<1 x half> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1f32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v4f32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x float> %arg, i32 0 + %vec = insertelement poison, float %elt, i32 0 + call void @use_v4f32(<1 x float> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v2f64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x double> %arg, i32 0 + %vec = insertelement poison, double %elt, i32 0 + call void @use_v2f64(<1 x double> %arg) + store %vec, ptr %ptr + ret void +} + +; +; Full vector arguments +; + +define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v16i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv16i8.v16i8( poison, <16 x i8> %arg, i64 0) + call void @use_v16i8(<16 x i8> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v8i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv8i16.v8i16( poison, <8 x i16> %arg, i64 0) + call void @use_v8i16(<8 x i16> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v4i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> %arg, i64 0) + call void @use_v4i32(<4 x i32> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v2i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> %arg, i64 0) + call void @use_v2i64(<2 x i64> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v8f16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %arg, i64 0) + call void @use_v8f16(<8 x half> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v8bf16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv8bf16.v8bf16( poison, <8 x bfloat> %arg, i64 0) + call void @use_v8bf16(<8 x bfloat> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v4f32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv4f32.v4f32( poison, <4 x float> %arg, i64 0) + call void @use_v4f32(<4 x float> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v2f64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv2f64.v2f64( poison, <2 x double> %arg, i64 0) + call void @use_v2f64(<2 x double> %arg) + store %vec, ptr %ptr + ret void +} + +; +; <8 x i1> type will need type promotion. +; +define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: and z1.b, z1.b, #0x1 +; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; CHECK-NEXT: str p0, [x8, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v8i1 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr p0, [x8, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: str p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv8i1.v8i1( poison, <8 x i1> %arg, i64 0) + call void @use_v8i1(<8 x i1> %arg) + store %vec, ptr %ptr + ret void +} + +; +; Scalar return values +; + +define void @dont_coalesce_res_i8(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i8 @get_i8() + %vec = insertelement poison, i8 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_i16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i16 @get_i16() + %vec = insertelement poison, i16 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_i32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i32 @get_i32() + %vec = insertelement poison, i32 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_i64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i64 @get_i64() + %vec = insertelement poison, i64 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_f16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_f16 +; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, 
#80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call half @get_f16() + %vec = insertelement poison, half %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_f32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_f32 +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call float @get_f32() + %vec = insertelement poison, float %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_f64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl 
get_f64 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call double @get_f64() + %vec = insertelement poison, double %res, i32 0 + store %vec, ptr %ptr + ret void +} + +; +; Single-element vector result values +; + +define void @dont_coalesce_res_v1i8(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i8 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i8> @get_v1i8() + %elt = 
extractelement <1 x i8> %res, i32 0 + %vec = insertelement poison, i8 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1i16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i16 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i16> @get_v1i16() + %elt = extractelement <1 x i16> %res, i32 0 + %vec = insertelement poison, i16 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1i32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i32 +; 
CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i32> @get_v1i32() + %elt = extractelement <1 x i32> %res, i32 0 + %vec = insertelement poison, i32 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1i64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i64 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i64> @get_v1i64() + %elt = 
extractelement <1 x i64> %res, i32 0 + %vec = insertelement poison, i64 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1f16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1f16 +; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x half> @get_v1f16() + %elt = extractelement <1 x half> %res, i32 0 + %vec = insertelement poison, half %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1f32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1f32 +; 
CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x float> @get_v1f32() + %elt = extractelement <1 x float> %res, i32 0 + %vec = insertelement poison, float %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1f64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1f64 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x double> @get_v1f64() + 
%elt = extractelement <1 x double> %res, i32 0 + %vec = insertelement poison, double %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +; +; Full vector result values +; + +define void @dont_coalesce_res_v16i8(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v16i8 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <16 x i8> @get_v16i8() + %vec = call @llvm.vector.insert.nxv16i8.v16i8( poison, <16 x i8> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v8i16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; 
CHECK-NEXT: bl get_v8i16 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <8 x i16> @get_v8i16() + %vec = call @llvm.vector.insert.nxv8i16.v8i16( poison, <8 x i16> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v4i32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v4i32 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <4 x i32> @get_v4i32() + %vec = 
call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v2i64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v2i64 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <2 x i64> @get_v2i64() + %vec = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v8f16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v8f16 +; CHECK-NEXT: str q0, [sp] // 16-byte 
Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <8 x half> @get_v8f16() + %vec = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v4f32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v4f32 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <4 x float> @get_v4f32() + %vec = call @llvm.vector.insert.nxv4f32.v4f32( poison, <4 x float> 
%res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v2f64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v2f64 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <2 x double> @get_v2f64() + %vec = call @llvm.vector.insert.nxv2f64.v2f64( poison, <2 x double> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +declare half @get_f16() +declare float @get_f32() +declare double @get_f64() +declare <1 x half> @get_v1f16() +declare <1 x float> @get_v1f32() +declare <1 x double> @get_v1f64() +declare <8 x half> @get_v8f16() +declare <4 x float> @get_v4f32() +declare <2 x double> @get_v2f64() + +declare i8 @get_i8() +declare i16 @get_i16() +declare i32 @get_i32() +declare i64 @get_i64() +declare <1 x i8> @get_v1i8() +declare <1 x i16> @get_v1i16() +declare <1 x i32> @get_v1i32() +declare <2 x i64> @get_v1i64() +declare <16 x i8> @get_v16i8() +declare <8 x i16> @get_v8i16() +declare <4 x i32> @get_v4i32() +declare <2 x i64> @get_v2i64() 
+ +declare void @use_f16(half) +declare void @use_f32(float) +declare void @use_f64(double) +declare void @use_v1f16(<1 x half>) +declare void @use_v1f32(<1 x float>) +declare void @use_v1f64(<1 x double>) +declare void @use_v8f16(<8 x half>) +declare void @use_v8bf16(<8 x bfloat>) +declare void @use_v4f32(<4 x float>) +declare void @use_v2f64(<2 x double>) + +declare void @use_i8(i8) +declare void @use_i16(i16) +declare void @use_i32(i32) +declare void @use_i64(i64) +declare void @use_v1i8(<1 x i8>) +declare void @use_v1i16(<1 x i16>) +declare void @use_v1i32(<1 x i32>) +declare void @use_v1i64(<1 x i64>) +declare void @use_v16i8(<16 x i8>) +declare void @use_v8i16(<8 x i16>) +declare void @use_v4i32(<4 x i32>) +declare void @use_v2i64(<2 x i64>) +declare void @use_v8i1(<8 x i1>) + +declare @llvm.vector.insert.nxv8i1.v8i1(, <8 x i1>, i64) +declare @llvm.vector.insert.nxv16i8.v16i8(, <16 x i8>, i64) +declare @llvm.vector.insert.nxv8i16.v8i16(, <8 x i16>, i64) +declare @llvm.vector.insert.nxv4i32.v4i32(, <4 x i32>, i64) +declare @llvm.vector.insert.nxv2i64.v2i64(, <2 x i64>, i64) +declare @llvm.vector.insert.nxv8f16.v8f16(, <8 x half>, i64) +declare @llvm.vector.insert.nxv4f32.v4f32(, <4 x float>, i64) +declare @llvm.vector.insert.nxv2f64.v2f64(, <2 x double>, i64) + +attributes #0 = { nounwind "aarch64_pstate_sm_enabled" "target-features"="+sve,+sme" } diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll index 55dc28f49bf157..93875549cffc86 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll @@ -249,11 +249,15 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta ; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; 
CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl cos ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 5d0c9127d3ebb2..296f2be9cfee5e 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -129,30 +129,37 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB4_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB4_2: -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl normal_callee_vec_arg -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: str z0, [sp] // 16-byte Folded 
Spill +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB4_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB4_4: ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload @@ -462,7 +469,11 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK-NEXT: mov x9, x0 ; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: ldp s4, s0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: stp s4, s0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: ldp d4, d0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: stp d4, d0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB10_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll index dd7d6470ad7b08..86918a59f3810e 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -313,9 +313,9 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: str x30, [sp, #80] // 
8-byte Folded Spill -; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: stp d0, d0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-NEXT: bl cos ; CHECK-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm @@ -405,11 +405,11 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: stp s1, s0, [sp, #24] // 8-byte Folded Spill +; CHECK-NEXT: stp d3, d2, [sp, #8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldp d2, d3, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp s1, s0, [sp, #24] // 8-byte Folded Reload +; CHECK-NEXT: ldp d3, d2, [sp, #8] // 16-byte Folded Reload ; CHECK-NEXT: bl bar ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll index 45ca7844b06551..cf171f8ef5ed3a 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll @@ -22,9 +22,9 @@ define void @test_no_stackslot_scavenging(float %f) #0 { ; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP +; 
CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f diff --git a/llvm/test/CodeGen/AArch64/vararg-tallcall.ll b/llvm/test/CodeGen/AArch64/vararg-tallcall.ll index 2d6db1642247d7..812837639196e6 100644 --- a/llvm/test/CodeGen/AArch64/vararg-tallcall.ll +++ b/llvm/test/CodeGen/AArch64/vararg-tallcall.ll @@ -1,5 +1,6 @@ ; RUN: llc -mtriple=aarch64-windows-msvc %s -o - | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s +; RUN: llc -mtriple=arm64ec-windows-msvc %s -o - | FileCheck %s --check-prefixes=CHECK-EC ; RUN: llc -global-isel -global-isel-abort=2 -verify-machineinstrs -mtriple=aarch64-windows-msvc %s -o - | FileCheck %s ; RUN: llc -global-isel -global-isel-abort=2 -verify-machineinstrs -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s @@ -32,3 +33,10 @@ attributes #1 = { noinline optnone "thunk" } ; CHECK: ldr x9, [x9] ; CHECK: mov v0.16b, v16.16b ; CHECK: br x9 +; CHECK-EC: mov v7.16b, v0.16b +; CHECK-EC: ldr x9, [x0] +; CHECK-EC: ldr x11, [x9] +; CHECK-EC: mov v0.16b, v7.16b +; CHECK-EC: add x4, sp, #64 +; CHECK-EC: add sp, sp, #64 +; CHECK-EC: br x11 diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 1b22e2f900ddb7..557aa010b3a7d9 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -489,3 +489,31 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) { %bitmask = bitcast <6 x i1> %cmp_result to i6 ret i6 %bitmask } + +; Only apply the combine when casting a vector to a scalar. 
+define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind { +; CHECK-LABEL: vector_to_vector_cast: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: shl.16b v0, v0, #7 +; CHECK-NEXT: Lloh36: +; CHECK-NEXT: adrp x8, lCPI20_0@PAGE +; CHECK-NEXT: Lloh37: +; CHECK-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] +; CHECK-NEXT: add x8, sp, #14 +; CHECK-NEXT: cmlt.16b v0, v0, #0 +; CHECK-NEXT: and.16b v0, v0, v1 +; CHECK-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-NEXT: zip1.16b v0, v0, v1 +; CHECK-NEXT: addv.8h h0, v0 +; CHECK-NEXT: str h0, [sp, #14] +; CHECK-NEXT: ld1.b { v0 }[0], [x8] +; CHECK-NEXT: orr x8, x8, #0x1 +; CHECK-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh36, Lloh37 + %bc = bitcast <16 x i1> %arg to <2 x i8> + ret <2 x i8> %bc +} diff --git a/llvm/test/CodeGen/AArch64/win64-fpowi.ll b/llvm/test/CodeGen/AArch64/win64-fpowi.ll new file mode 100644 index 00000000000000..3eb74f8394ec4e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/win64-fpowi.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=aarch64-pc-windows-msvc19 -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=aarch64-pc-windows-msvc19 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - | FileCheck %s + +define double @powi_f64(double %a, i32 %b) { +; CHECK-LABEL: powi_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: scvtf d1, w0 +; CHECK-NEXT: b pow +entry: + %c = call double @llvm.powi.f64.i32(double %a, i32 %b) + ret double %c +} + +define float @powi_f32(float %a, i32 %b) { +; CHECK-LABEL: powi_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: scvtf s1, w0 +; CHECK-NEXT: b powf +entry: + %c = call float @llvm.powi.f32.i32(float %a, i32 %b) + ret float %c +} + +define half @powi_f16(half %a, i32 %b) { +; CHECK-LABEL: powi_f16: +; CHECK: .seh_proc powi_f16 +; 
CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: scvtf s1, w0 +; CHECK-NEXT: bl powf +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc +entry: + %c = call half @llvm.powi.f16.i32(half %a, i32 %b) + ret half %c +} + +define <2 x double> @powi_v2f64(<2 x double> %a, i32 %b) { +; CHECK-LABEL: powi_v2f64: +; CHECK: .seh_proc powi_v2f64 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .seh_stackalloc 48 +; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg x30, 32 +; CHECK-NEXT: str d8, [sp, #40] // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_freg d8, 40 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: scvtf d8, w0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: mov d0, v0.d[1] +; CHECK-NEXT: fmov d1, d8 +; CHECK-NEXT: bl pow +; CHECK-NEXT: fmov d1, d8 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: bl pow +; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr d8, [sp, #40] // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_freg d8, 40 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg x30, 32 +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: .seh_stackalloc 48 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc 
+entry: + %c = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> %a, i32 %b) + ret <2 x double> %c +} + +define <2 x float> @powi_v2f32(<2 x float> %a, i32 %b) { +; CHECK-LABEL: powi_v2f32: +; CHECK: .seh_proc powi_v2f32 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .seh_stackalloc 48 +; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg x30, 32 +; CHECK-NEXT: str d8, [sp, #40] // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_freg d8, 40 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: scvtf s8, w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: mov s0, v0.s[1] +; CHECK-NEXT: fmov s1, s8 +; CHECK-NEXT: bl powf +; CHECK-NEXT: fmov s1, s8 +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: bl powf +; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: mov v0.s[1], v1.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr d8, [sp, #40] // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_freg d8, 40 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg x30, 32 +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: .seh_stackalloc 48 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc +entry: + %c = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> %a, i32 %b) + ret <2 x float> %c +} + +declare <2 x double> @llvm.powi.v2f64.i32(<2 x double>, i32) +declare <2 x float> @llvm.powi.v2f32.i32(<2 x float>, i32) +declare double @llvm.powi.f64.i32(double, i32) +declare float @llvm.powi.f32.i32(float, i32) +declare half @llvm.powi.f16.i32(half, i32) diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll new file mode 100644 index 00000000000000..ea377531df2aea --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll @@ -0,0 +1,504 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: 
v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0) + 
store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x half> %C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %res = call <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; 
%bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res 
= call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; 
GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; 
GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %fneg.fabs.C = fneg <8 x float> %fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %fneg.fabs.C = fneg <8 x half> %fabs.C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 
v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <8 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> 
%B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9] +; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] offset:16 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX12-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v8 +; GFX12-NEXT: v_lshl_or_b32 v13, v15, 16, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_lshl_or_b32 v14, v17, 16, v14 +; GFX12-NEXT: v_lshl_or_b32 v15, v19, 16, v16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <16 x half>, ptr %Caddr + %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> + %fneg.C_shuffle = fneg <8 x half> %C_shuffle + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +declare <8 x half> @llvm.fabs.v8f16(<8 x half>) +declare <8 x float> @llvm.fabs.v8f32(<8 x float>) +declare float @llvm.fabs.f32(float) + +declare <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll new file mode 100644 index 00000000000000..6251dfdc392ebc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll @@ -0,0 +1,519 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: 
global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> 
%B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6 +; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 +; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], 
v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: 
s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: 
; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 
x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 +; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + 
ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 
x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll new file mode 100644 index 00000000000000..fe6d16bd8b5ead --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll @@ -0,0 +1,309 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: 
global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; 
GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store 
<8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], 
v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; 
GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll new file mode 100644 index 00000000000000..c80d7a6d9a836e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll @@ -0,0 +1,321 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> 
%C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0) + store <8 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1) + store <8 x half> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, 
v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0) + store <8 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1) + store <8 x i16> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 
index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v11, v[11:12], off +; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9 +; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off +; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16 +; GFX12-NEXT: 
s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; 
GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = 
extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, 
<8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll new file mode 100644 index 00000000000000..c4edc5b72b2fbb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll @@ -0,0 +1,370 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s 
--check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, 
i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 
%B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: 
s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { +; 
GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + 
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x 
i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare 
<8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll new file mode 100644 index 00000000000000..eafbfb6d1eeb53 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll @@ -0,0 +1,459 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C) + store 
<4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x half> %C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + 
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] +; 
GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> 
addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %fneg.fabs.C = fneg <4 x float> %fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %fneg.fabs.C = fneg <4 x 
half> %fabs.C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <4 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define 
amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_lshl_or_b32 v4, v9, 16, v4 +; GFX12-NEXT: v_lshl_or_b32 v5, v11, 16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <8 x half>, ptr %Caddr + %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> + %fneg.C_shuffle = fneg <4 x half> %C_shuffle + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +declare 
<4 x half> @llvm.fabs.v4f16(<4 x half>) +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) +declare float @llvm.fabs.f32(float) + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll new file mode 100644 index 00000000000000..c4d70fd5f0637f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll @@ -0,0 +1,430 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + 
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v9, s3 +; GFX12-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; 
%bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v9, s3 +; GFX12-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: 
global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> 
@llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; 
GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; 
GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: 
v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) 
+ store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_movk_i32 s0, 0x80 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s1 +; GFX12-NEXT: v_mov_b32_e32 v4, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll new file mode 100644 index 00000000000000..7e1d09805df3f6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll @@ -0,0 +1,274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], 
v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> 
@llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret 
void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 
%B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, 
i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll new file mode 100644 index 00000000000000..b6f1828dce2576 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll @@ -0,0 +1,472 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void 
@test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, 
<8 x half> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off 
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], 
v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0) + store <4 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1) + store <4 x half> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2) + store <4 x half> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3) + store <4 x half> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, 
v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0) + store <4 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1) + store <4 x i16> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2) + store <4 x i16> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3) + store <4 x i16> %res3, ptr 
addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0) + store <4 x i32> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0) + store <4 x i32> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v6, v[6:7], off +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: v_mov_b32_e32 v12, v2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v16, v6 +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off +; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb 
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, 
ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: 
v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: 
v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll new file mode 100644 index 00000000000000..0d1871a18d4055 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel 
-march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 
v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr 
addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 
%A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: 
test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) 
+declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) 
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 4aa97c57cbd9c2..5296ad3ab51d31 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -1,9 +1,5446 @@ -; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s -; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-SDAG-O0 %s + +; FIXME: GlobalISel missing the power-of-2 cases in legalization. 
https://github.com/llvm/llvm-project/issues/80671 +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9 %s +; xUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-O0 %s -; SDAG-ERR: LLVM ERROR: unsupported libcall legalization -; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(s128) = G_SDIV %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: v_sdiv_i128_vv) define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { - %shl = sdiv i128 %lhs, %rhs - ret i128 %shl +; GFX9-LABEL: v_sdiv_i128_vv: +; GFX9: ; %bb.0: ; %_udiv-special-cases +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v16, 31, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, v16, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, v16, v1 +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v16 +; GFX9-NEXT: v_xor_b32_e32 v2, v16, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v16, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v7 +; GFX9-NEXT: v_xor_b32_e32 v3, v16, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v2, v16, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v16, vcc +; GFX9-NEXT: v_xor_b32_e32 v3, v17, v4 +; GFX9-NEXT: v_xor_b32_e32 v2, v17, v5 +; GFX9-NEXT: v_sub_co_u32_e32 v20, vcc, v3, v17 +; GFX9-NEXT: v_xor_b32_e32 v0, v17, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v21, vcc, v2, v17, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, v17, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v17, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v17, vcc +; GFX9-NEXT: v_or_b32_e32 v3, v21, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v20, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v3, v9, v11 +; GFX9-NEXT: v_or_b32_e32 v2, v8, v10 +; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; GFX9-NEXT: v_ffbh_u32_e32 v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 32, v2 +; GFX9-NEXT: v_ffbh_u32_e32 v3, v1 +; GFX9-NEXT: v_min_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_ffbh_u32_e32 v3, v20 +; 
GFX9-NEXT: v_add_u32_e32 v3, 32, v3 +; GFX9-NEXT: v_ffbh_u32_e32 v4, v21 +; GFX9-NEXT: v_min_u32_e32 v3, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 64, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: v_ffbh_u32_e32 v5, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_ffbh_u32_e32 v3, v10 +; GFX9-NEXT: v_add_u32_e32 v3, 32, v3 +; GFX9-NEXT: v_min_u32_e32 v3, v3, v5 +; GFX9-NEXT: v_ffbh_u32_e32 v5, v8 +; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX9-NEXT: v_ffbh_u32_e32 v6, v9 +; GFX9-NEXT: v_min_u32_e32 v5, v5, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 64, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v5, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v18, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX9-NEXT: v_xor_b32_e32 v6, 0x7f, v2 +; GFX9-NEXT: v_or_b32_e32 v7, v3, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: v_cndmask_b32_e64 v13, v11, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v12, 
v10, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_6 +; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v7, 0x7f, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v5, vcc +; GFX9-NEXT: v_sub_u32_e32 v12, 64, v7 +; GFX9-NEXT: v_or_b32_e32 v4, v23, v25 +; GFX9-NEXT: v_or_b32_e32 v3, v22, v24 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v12, v[8:9] +; GFX9-NEXT: v_sub_u32_e32 v2, 63, v2 +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[8:9] +; GFX9-NEXT: v_or_b32_e32 v4, v6, v13 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v12 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v7, v[8:9] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-NEXT: ; %bb.2: ; %udiv-preheader +; GFX9-NEXT: v_sub_u32_e32 v12, 64, v22 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v22, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[12:13], v12, v[10:11] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 +; GFX9-NEXT: v_or_b32_e32 v12, v6, v12 +; GFX9-NEXT: v_subrev_u32_e32 v6, 64, v22 +; GFX9-NEXT: 
v_or_b32_e32 v13, v7, v13 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v6, v[10:11] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v12, v6, v12, vcc +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v22, v[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, -1, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v21, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, -1, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v1, vcc +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: .LBB0_3: ; %udiv-do-while +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_or_b32_e32 v4, v14, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v9 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-NEXT: v_or_b32_e32 v10, v10, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v3 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v14 +; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v26, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v27, v9, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v28, v10, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v29, v11, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v14 +; GFX9-NEXT: v_and_b32_e32 v14, v30, v20 +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v14 +; GFX9-NEXT: v_and_b32_e32 v14, v30, v21 +; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v14, vcc +; GFX9-NEXT: v_and_b32_e32 v14, v30, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v14, vcc +; GFX9-NEXT: v_and_b32_e32 v14, v30, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 
v11, v14, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc +; GFX9-NEXT: v_or_b32_e32 v5, v15, v5 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v14, v22, v24 +; GFX9-NEXT: v_or_b32_e32 v15, v23, v25 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12 +; GFX9-NEXT: v_and_b32_e32 v6, 1, v30 +; GFX9-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-NEXT: v_or3_b32 v3, v3, 0, v13 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB0_3 +; GFX9-NEXT: ; %bb.4: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB0_5: ; %Flow2 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 +; GFX9-NEXT: v_or3_b32 v13, v3, 0, v13 +; GFX9-NEXT: v_or3_b32 v12, v2, v4, v12 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v1 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v0 +; GFX9-NEXT: .LBB0_6: ; %Flow3 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_xor_b32_e32 v2, v17, v16 +; GFX9-NEXT: v_xor_b32_e32 v3, v19, v18 +; GFX9-NEXT: v_xor_b32_e32 v0, v6, v2 +; GFX9-NEXT: v_xor_b32_e32 v1, v7, v3 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_xor_b32_e32 v5, v12, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_xor_b32_e32 v4, v13, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v2, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_sdiv_i128_vv: +; GFX9-O0: ; %bb.0: ; %_udiv-special-cases +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; 
implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 +; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: v_ashrrev_i64 v[11:12], s4, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v6, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v13 +; GFX9-O0-NEXT: v_xor_b32_e64 v13, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v6, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_xor_b32_e64 v15, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v4 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 
killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v5, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_xor_b32_e64 v11, v3, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v5, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v3, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v8, v3, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], 
s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v15, v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v3, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, 
v15 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: s_mov_b32 s9, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: s_mov_b32 s12, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: s_mov_b32 s14, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[12:13] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: 
v_ffbh_u32_e64 v6, v2 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v5, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v4 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: s_mov_b32 s8, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: s_mov_b32 s10, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[8:9], v11, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[8:9], v5, v12, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: s_mov_b32 s10, s6 +; GFX9-O0-NEXT: s_mov_b32 s11, s7 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v6, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] +; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v7, 1 +; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; 
GFX9-O0-NEXT: s_mov_b32 s14, s13 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 +; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[5:6], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v2, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-O0-NEXT: s_branch .LBB0_8 +; GFX9-O0-NEXT: .LBB0_1: ; %Flow +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 
; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB0_5 +; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB0_9 +; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_or3_b32 v4, v4, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB0_3 +; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB0_4 +; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while +; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 +; GFX9-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s5, v[23:24] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v23, v5, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[2:3] +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v29 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[6:7] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v6, v30 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v25 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1 +; GFX9-O0-NEXT: s_mov_b32 s8, s5 +; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; 
GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v21, v11, v21 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v20 +; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v11, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v20 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v19, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v20, vcc, v11, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v10, v11, vcc +; 
GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v21 +; GFX9-O0-NEXT: v_or_b32_e64 v19, v19, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v17, v17, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 
offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 +; GFX9-O0-NEXT: s_branch .LBB0_1 +; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 ; 
4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[21:22] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: s_mov_b32 s6, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6 +; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6 +; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; 
GFX9-O0-NEXT: s_mov_b32 s8, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v12, v12, v15, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v17, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v15, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v15, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v13 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 
offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte 
Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB0_6 +; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: s_mov_b32 s4, s7 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: s_mov_b32 s9, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; 
GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f +; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: s_mov_b32 s4, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3 +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4 +; GFX9-O0-NEXT: s_mov_b32 s10, 63 +; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, 
v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[10:11] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v4, v7, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-O0-NEXT: s_branch .LBB0_7 +; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte 
Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_xor_b32_e64 v9, v6, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = sdiv i128 %lhs, %rhs + ret i128 %div +} + +define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { +; GFX9-LABEL: v_udiv_i128_vv: +; GFX9: ; %bb.0: ; %_udiv-special-cases +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v9, v5, v7 +; GFX9-NEXT: v_or_b32_e32 v8, v4, v6 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_or_b32_e32 v9, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v8, v0, v2 +; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] +; GFX9-NEXT: v_ffbh_u32_e32 v8, v6 +; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 +; GFX9-NEXT: v_ffbh_u32_e32 v9, v7 +; GFX9-NEXT: v_min_u32_e32 v8, v8, v9 +; GFX9-NEXT: v_ffbh_u32_e32 v9, v4 +; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-NEXT: v_ffbh_u32_e32 v10, v5 +; GFX9-NEXT: v_min_u32_e32 v9, v9, v10 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 64, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_ffbh_u32_e32 v11, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; GFX9-NEXT: v_ffbh_u32_e32 v9, v2 +; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-NEXT: 
v_min_u32_e32 v9, v9, v11 +; GFX9-NEXT: v_ffbh_u32_e32 v11, v0 +; GFX9-NEXT: v_add_u32_e32 v11, 32, v11 +; GFX9-NEXT: v_ffbh_u32_e32 v12, v1 +; GFX9-NEXT: v_min_u32_e32 v11, v11, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, 64, v11 +; GFX9-NEXT: v_addc_co_u32_e64 v12, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v13, v12, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v12, vcc, v8, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v13, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_subbrev_co_u32_e32 v14, vcc, 0, v8, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v8, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[12:13] +; GFX9-NEXT: v_or_b32_e32 v10, v13, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX9-NEXT: v_xor_b32_e32 v9, 0x7f, v12 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v14 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] +; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v0, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_6 +; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 1, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v15, vcc +; GFX9-NEXT: v_sub_u32_e32 
v15, 0x7f, v12 +; GFX9-NEXT: v_or_b32_e32 v9, v19, v21 +; GFX9-NEXT: v_or_b32_e32 v8, v18, v20 +; GFX9-NEXT: v_sub_u32_e32 v13, 64, v15 +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], v13, v[0:1] +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_sub_u32_e32 v8, 63, v12 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1] +; GFX9-NEXT: v_or_b32_e32 v11, v11, v14 +; GFX9-NEXT: v_or_b32_e32 v10, v10, v13 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: ; %bb.2: ; %udiv-preheader +; GFX9-NEXT: v_sub_u32_e32 v14, 64, v18 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v18, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[14:15], v14, v[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX9-NEXT: v_or_b32_e32 v14, v12, v14 +; GFX9-NEXT: v_subrev_u32_e32 v12, 64, v18 +; GFX9-NEXT: v_or_b32_e32 v15, v13, v15 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v12, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v18, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; GFX9-NEXT: v_addc_co_u32_e32 
v24, vcc, -1, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v7, vcc +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: .LBB1_3: ; %udiv-do-while +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_lshlrev_b64 v[26:27], 1, v[10:11] +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 +; GFX9-NEXT: v_or_b32_e32 v10, v16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_or_b32_e32 v11, v17, v27 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 31, v9 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v17 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v22, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v23, v1, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v24, v2, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v25, v3, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v26, 31, v16 +; GFX9-NEXT: v_and_b32_e32 v16, v26, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v16 +; GFX9-NEXT: v_and_b32_e32 v16, v26, v5 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v16, vcc +; GFX9-NEXT: v_and_b32_e32 v16, v26, v6 +; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 +; GFX9-NEXT: v_and_b32_e32 v12, v26, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v16, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v12, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, -1, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, -1, v19, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v20, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc +; GFX9-NEXT: v_or_b32_e32 v16, v18, v20 +; GFX9-NEXT: v_or_b32_e32 v17, v19, v21 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GFX9-NEXT: 
v_and_b32_e32 v12, 1, v26 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: ; %bb.4: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB1_5: ; %Flow2 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[10:11] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v11 +; GFX9-NEXT: v_or3_b32 v8, v3, 0, v15 +; GFX9-NEXT: v_or3_b32 v9, v2, v4, v14 +; GFX9-NEXT: v_or_b32_e32 v10, v13, v1 +; GFX9-NEXT: v_or_b32_e32 v11, v12, v0 +; GFX9-NEXT: .LBB1_6: ; %Flow3 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_udiv_i128_vv: +; GFX9-O0: ; %bb.0: ; %_udiv-special-cases +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; 
GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v15, v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v3, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: s_mov_b32 s9, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; 
kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: s_mov_b32 s12, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: s_mov_b32 s14, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[12:13] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v2 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v5, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v4 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: s_mov_b32 s8, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; 
GFX9-O0-NEXT: s_mov_b32 s10, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[8:9], v11, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[8:9], v5, v12, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: s_mov_b32 s10, s6 +; GFX9-O0-NEXT: s_mov_b32 s11, s7 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v6, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: buffer_store_dword v10, off, 
s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] +; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v7, 1 +; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: s_mov_b32 s14, s13 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 +; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed 
$exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[5:6], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v2, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-O0-NEXT: s_branch .LBB1_8 +; GFX9-O0-NEXT: .LBB1_1: ; %Flow +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 
exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB1_5 +; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 
; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB1_9 +; GFX9-O0-NEXT: .LBB1_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_or3_b32 v4, v4, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB1_3 +; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB1_4 +; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while +; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: s_waitcnt 
vmcnt(16) +; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 +; GFX9-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s5, v[23:24] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v23, v5, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[2:3] +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v29 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[6:7] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v25 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1 +; GFX9-O0-NEXT: s_mov_b32 s8, s5 +; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v21, v11, v21 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v20 +; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v11, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 +; 
GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v20 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v19, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v20, vcc, v11, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v10, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v21 +; GFX9-O0-NEXT: v_or_b32_e64 v19, v19, v22 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v17, v17, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: 
buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execnz .LBB1_6 +; GFX9-O0-NEXT: s_branch .LBB1_1 +; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[21:22] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: s_mov_b32 s6, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed 
$exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6 +; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6 +; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s8, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v12, v12, v15, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 
v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v17, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v15, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v15, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v13 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded 
Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB1_6 +; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 
offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: s_mov_b32 s4, s7 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: s_mov_b32 s9, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f +; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: s_mov_b32 s4, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3 +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4 +; GFX9-O0-NEXT: s_mov_b32 s10, 63 +; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[10:11] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; 
GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v4, v7, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 
4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-O0-NEXT: s_branch .LBB1_7 +; GFX9-O0-NEXT: .LBB1_9: ; %udiv-end +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 
4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = udiv i128 %lhs, %rhs + ret i128 %div +} + +define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { +; GFX9-LABEL: v_srem_i128_vv: +; GFX9: ; %bb.0: ; %_udiv-special-cases +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v20, 31, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v20 +; GFX9-NEXT: v_xor_b32_e32 v10, v2, v20 +; GFX9-NEXT: v_xor_b32_e32 v1, v1, v20 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v20 +; GFX9-NEXT: v_xor_b32_e32 v9, v3, v20 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v20, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v10, v20, vcc +; GFX9-NEXT: v_xor_b32_e32 v4, v4, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v9, v20, vcc +; GFX9-NEXT: v_xor_b32_e32 v5, v5, v8 +; GFX9-NEXT: v_sub_co_u32_e32 v23, vcc, v4, v8 +; GFX9-NEXT: v_xor_b32_e32 v6, v6, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v21, vcc, v5, v8, vcc +; GFX9-NEXT: v_xor_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v8, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v8, vcc +; GFX9-NEXT: v_or_b32_e32 v7, v21, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v23, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_or_b32_e32 v7, v3, v1 +; GFX9-NEXT: v_or_b32_e32 v6, v2, v0 +; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7] +; GFX9-NEXT: v_ffbh_u32_e32 v6, v4 +; GFX9-NEXT: v_add_u32_e32 v6, 32, v6 +; GFX9-NEXT: v_ffbh_u32_e32 v7, v5 +; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX9-NEXT: v_ffbh_u32_e32 v7, v23 +; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX9-NEXT: v_ffbh_u32_e32 v8, v21 +; GFX9-NEXT: v_min_u32_e32 v7, v7, v8 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 64, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_ffbh_u32_e32 v9, v1 +; GFX9-NEXT: v_cndmask_b32_e32 
v6, v7, v6, vcc +; GFX9-NEXT: v_ffbh_u32_e32 v7, v0 +; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX9-NEXT: v_min_u32_e32 v7, v7, v9 +; GFX9-NEXT: v_ffbh_u32_e32 v9, v2 +; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-NEXT: v_ffbh_u32_e32 v10, v3 +; GFX9-NEXT: v_min_u32_e32 v9, v9, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 64, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v10, vcc +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_subbrev_co_u32_e32 v8, vcc, 0, v9, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: v_or_b32_e32 v13, v7, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_mov_b32_e32 v22, v20 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX9-NEXT: v_xor_b32_e32 v10, 0x7f, v6 +; GFX9-NEXT: v_or_b32_e32 v12, v10, v8 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v1, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v12, v0, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_6 +; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v7, vcc +; 
GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v8, vcc +; GFX9-NEXT: v_sub_u32_e32 v13, 0x7f, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v9, vcc +; GFX9-NEXT: v_sub_u32_e32 v11, 64, v13 +; GFX9-NEXT: v_or_b32_e32 v8, v25, v27 +; GFX9-NEXT: v_or_b32_e32 v7, v24, v26 +; GFX9-NEXT: v_lshlrev_b64 v[9:10], v13, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], v11, v[2:3] +; GFX9-NEXT: v_sub_u32_e32 v6, 63, v6 +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v8, v10, v12 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v13 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 +; GFX9-NEXT: v_lshlrev_b64 v[12:13], v13, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v1, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v13, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_5 +; GFX9-NEXT: ; %bb.2: ; %udiv-preheader +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v24 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[0:1] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 +; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v24 +; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v15, v9, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v10, v2, s[4:5] +; GFX9-NEXT: 
v_cndmask_b32_e32 v17, 0, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, -1, v23 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v21, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, -1, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v18, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, -1, v5, vcc +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v19, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: .LBB2_3: ; %udiv-do-while +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 31, v15 +; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[14:15] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 31, v7 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v13 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17] +; GFX9-NEXT: v_or_b32_e32 v14, v14, v33 +; GFX9-NEXT: v_or3_b32 v6, v6, v8, v10 +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v28, v14 +; GFX9-NEXT: v_or_b32_e32 v16, v16, v32 +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v29, v15, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v30, v16, vcc +; GFX9-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v31, v17, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GFX9-NEXT: v_or_b32_e32 v12, v18, v12 +; GFX9-NEXT: v_and_b32_e32 v18, v8, v23 +; GFX9-NEXT: v_or_b32_e32 v13, v19, v13 +; GFX9-NEXT: v_and_b32_e32 v19, v8, v21 +; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v14, v18 +; GFX9-NEXT: v_and_b32_e32 v32, v8, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v15, v19, vcc +; GFX9-NEXT: v_and_b32_e32 v33, v8, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v16, v32, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v33, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, -1, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v26, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v27, vcc +; GFX9-NEXT: 
v_or_b32_e32 v18, v24, v26 +; GFX9-NEXT: v_or_b32_e32 v19, v25, v27 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX9-NEXT: v_mov_b32_e32 v19, v9 +; GFX9-NEXT: v_or3_b32 v7, v7, 0, v11 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v18, v8 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB2_3 +; GFX9-NEXT: ; %bb.4: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB2_5: ; %Flow2 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[12:13] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v13 +; GFX9-NEXT: v_or3_b32 v11, v7, 0, v11 +; GFX9-NEXT: v_or3_b32 v12, v6, v12, v10 +; GFX9-NEXT: v_or_b32_e32 v10, v9, v15 +; GFX9-NEXT: v_or_b32_e32 v13, v8, v14 +; GFX9-NEXT: .LBB2_6: ; %Flow3 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mul_lo_u32 v16, v13, v5 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v21, v13, v[14:15] +; GFX9-NEXT: v_mul_lo_u32 v9, v10, v4 +; GFX9-NEXT: v_mul_lo_u32 v11, v11, v23 +; GFX9-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v23, v10, v[13:14] +; GFX9-NEXT: v_add3_u32 v8, v8, v16, v9 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v23, v[7:8] +; GFX9-NEXT: v_mov_b32_e32 v8, v14 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v8 +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mul_lo_u32 v12, v12, v21 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v21, v10, v[8:9] +; GFX9-NEXT: v_add3_u32 v4, v11, v7, v12 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v7, v13 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v5 +; 
GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_xor_b32_e32 v5, v0, v20 +; GFX9-NEXT: v_xor_b32_e32 v0, v2, v20 +; GFX9-NEXT: v_xor_b32_e32 v4, v1, v22 +; GFX9-NEXT: v_xor_b32_e32 v1, v3, v22 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v20 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v22, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v20, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v22, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_srem_i128_vv: +; GFX9-O0: ; %bb.0: ; %_udiv-special-cases +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-O0-NEXT: v_ashrrev_i64 v[12:13], s4, v[6:7] +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_ashrrev_i64 v[6:7], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: v_xor_b32_e64 v13, v11, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed 
$exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v15, v4, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v2, v2, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v12 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v11, v12, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v10, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed 
$exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v6 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v5, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 
4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v15, v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v3, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: s_mov_b32 s9, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: s_mov_b32 s12, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: s_mov_b32 s14, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[12:13] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v2 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v5, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; 
GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v4 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: s_mov_b32 s8, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: s_mov_b32 s10, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[8:9], v11, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[8:9], v5, v12, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: s_mov_b32 s10, s6 +; GFX9-O0-NEXT: s_mov_b32 s11, s7 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v6, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; 
GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] +; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v7, 1 +; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: s_mov_b32 s14, s13 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 +; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[5:6], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v2, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; 
GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-O0-NEXT: s_branch .LBB2_8 +; GFX9-O0-NEXT: .LBB2_1: ; %Flow +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB2_5 +; GFX9-O0-NEXT: .LBB2_3: ; %Flow2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB2_9 +; GFX9-O0-NEXT: .LBB2_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_or3_b32 v4, v4, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB2_3 +; GFX9-O0-NEXT: .LBB2_5: ; %Flow1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 
s4, v8, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB2_4 +; GFX9-O0-NEXT: .LBB2_6: ; %udiv-do-while +; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, 
s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 +; GFX9-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s5, v[23:24] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v23, v5, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[2:3] +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v29 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[6:7] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: 
v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v25 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1 +; GFX9-O0-NEXT: s_mov_b32 s8, s5 +; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v21, v11, v21 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, 
v20 +; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v11, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v20 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v19, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v20, vcc, v11, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v10, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def 
$vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v21 +; GFX9-O0-NEXT: v_or_b32_e64 v19, v19, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v17, v17, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 
v16, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) 
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execnz .LBB2_6 +; GFX9-O0-NEXT: s_branch .LBB2_1 +; GFX9-O0-NEXT: .LBB2_7: ; %udiv-preheader +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: 
v_lshrrev_b64 v[6:7], v4, v[21:22] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: s_mov_b32 s6, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6 +; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6 +; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s8, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v12, v12, v15, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 
def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v17, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v15, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v15, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v13 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 
; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB2_6 +; GFX9-O0-NEXT: .LBB2_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: s_mov_b32 s4, s7 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: s_mov_b32 s9, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: buffer_store_dword v5, 
off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f +; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: s_mov_b32 s4, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3 +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4 +; GFX9-O0-NEXT: s_mov_b32 s10, 63 +; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[10:11] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def 
$vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v4, v7, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB2_5 +; GFX9-O0-NEXT: s_branch .LBB2_7 +; GFX9-O0-NEXT: .LBB2_9: ; %udiv-end +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, 
off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v17 +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v1, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[17:18], s4, v[17:18] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mul_lo_u32 v2, v5, v2 +; GFX9-O0-NEXT: v_mad_u64_u32 v[17:18], s[6:7], v5, v0, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-O0-NEXT: v_add3_u32 v2, v0, v2, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 killed $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: s_mov_b32 s5, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v0 
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v17, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v6 +; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s4, v[11:12] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v19 +; GFX9-O0-NEXT: v_mul_lo_u32 v11, v11, v0 +; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v2, v0, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 killed $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v18 +; 
GFX9-O0-NEXT: v_add_co_u32_e64 v17, s[6:7], v11, v12 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v2 +; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v6, v1, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v12 +; GFX9-O0-NEXT: v_lshlrev_b64 v[19:20], s4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v6, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v19 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 +; GFX9-O0-NEXT: 
v_lshlrev_b64 v[19:20], s4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v21 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v23, v11, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v6 +; GFX9-O0-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v0, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 +; GFX9-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v5, v20 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v19, s[6:7], v6, v19, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v6 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff +; GFX9-O0-NEXT: s_mov_b32 s8, s7 +; GFX9-O0-NEXT: v_and_b32_e64 v19, v19, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GFX9-O0-NEXT: v_and_b32_e64 v21, v20, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 +; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v0, v1, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v19 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: 
$sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[19:20], s4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v23 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 killed $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v23, v1, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 +; GFX9-O0-NEXT: v_add_co_u32_e64 v0, s[6:7], v0, v20 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v19, s[6:7], v1, v19, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v19 +; GFX9-O0-NEXT: v_lshrrev_b64 v[21:22], s4, v[0:1] +; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v22 +; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v19, v20 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v6, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v20 +; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v5, v6 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v17 
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v6 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v3, v5, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v12 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v11, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 
v2, v10 +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_xor_b32_e64 v9, v6, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = srem i128 %lhs, %rhs + ret i128 %div +} + +define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { +; GFX9-LABEL: v_urem_i128_vv: +; GFX9: ; %bb.0: ; %_udiv-special-cases +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v9, v5, v7 +; GFX9-NEXT: v_or_b32_e32 v8, v4, v6 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_or_b32_e32 v9, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v8, v0, v2 +; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] +; GFX9-NEXT: v_ffbh_u32_e32 v8, v6 +; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 +; GFX9-NEXT: v_ffbh_u32_e32 v9, v7 +; GFX9-NEXT: v_min_u32_e32 v8, v8, v9 +; GFX9-NEXT: v_ffbh_u32_e32 v9, v4 +; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-NEXT: v_ffbh_u32_e32 v10, v5 +; GFX9-NEXT: v_min_u32_e32 v9, v9, v10 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 64, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_ffbh_u32_e32 v11, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; GFX9-NEXT: v_ffbh_u32_e32 v9, v2 +; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-NEXT: v_min_u32_e32 v9, v9, v11 +; GFX9-NEXT: v_ffbh_u32_e32 v11, v0 +; GFX9-NEXT: v_add_u32_e32 v11, 32, v11 +; GFX9-NEXT: v_ffbh_u32_e32 v12, v1 +; GFX9-NEXT: v_min_u32_e32 v11, v11, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, 64, v11 +; GFX9-NEXT: v_addc_co_u32_e64 v12, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v9 
+; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v10, v12, vcc +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v11, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX9-NEXT: v_xor_b32_e32 v12, 0x7f, v8 +; GFX9-NEXT: v_or_b32_e32 v13, v9, v11 +; GFX9-NEXT: v_or_b32_e32 v12, v12, v10 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: v_cndmask_b32_e64 v15, v3, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v1, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v12, v0, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB3_6 +; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 1, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v9, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v10, vcc +; GFX9-NEXT: v_sub_u32_e32 v15, 0x7f, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v11, vcc +; GFX9-NEXT: v_sub_u32_e32 v13, 64, v15 +; GFX9-NEXT: v_or_b32_e32 v10, v23, v25 +; GFX9-NEXT: v_or_b32_e32 v9, v22, v24 +; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], v13, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v8, 63, v8 +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1] +; GFX9-NEXT: v_or_b32_e32 v10, v12, v14 +; GFX9-NEXT: v_or_b32_e32 v11, v11, v13 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] +; 
GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: ; %bb.2: ; %udiv-preheader +; GFX9-NEXT: v_sub_u32_e32 v14, 64, v22 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v22, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[14:15], v14, v[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 +; GFX9-NEXT: v_or_b32_e32 v14, v12, v14 +; GFX9-NEXT: v_subrev_u32_e32 v12, 64, v22 +; GFX9-NEXT: v_or_b32_e32 v15, v13, v15 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v12, v[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v17, v13, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v14, v12, v14, vcc +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v22, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v16, v14, v0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v12, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, -1, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, -1, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v20, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v7, vcc +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v21, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: .LBB3_3: ; %udiv-do-while +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 +; GFX9-NEXT: 
v_lshlrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_lshlrev_b64 v[18:19], 1, v[18:19] +; GFX9-NEXT: v_or_b32_e32 v10, v20, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 31, v17 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17] +; GFX9-NEXT: v_or_b32_e32 v18, v18, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 31, v9 +; GFX9-NEXT: v_or_b32_e32 v16, v16, v20 +; GFX9-NEXT: v_sub_co_u32_e32 v20, vcc, v26, v16 +; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v27, v17, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v28, v18, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v29, v19, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v20 +; GFX9-NEXT: v_and_b32_e32 v20, v30, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v16, v20 +; GFX9-NEXT: v_and_b32_e32 v20, v30, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v20, vcc +; GFX9-NEXT: v_and_b32_e32 v20, v30, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v18, v20, vcc +; GFX9-NEXT: v_and_b32_e32 v20, v30, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v19, v20, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc +; GFX9-NEXT: v_or_b32_e32 v11, v21, v11 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-NEXT: v_or_b32_e32 v20, v22, v24 +; GFX9-NEXT: v_or_b32_e32 v21, v23, v25 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 +; GFX9-NEXT: v_and_b32_e32 v12, 1, v30 +; GFX9-NEXT: v_mov_b32_e32 v21, v13 +; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v20, v12 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: ; %bb.4: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB3_5: ; %Flow2 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[10:11] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; 
GFX9-NEXT: v_lshrrev_b32_e32 v10, 31, v11 +; GFX9-NEXT: v_or3_b32 v15, v9, 0, v15 +; GFX9-NEXT: v_or3_b32 v14, v8, v10, v14 +; GFX9-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX9-NEXT: v_or_b32_e32 v12, v12, v16 +; GFX9-NEXT: .LBB3_6: ; %Flow3 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mul_lo_u32 v19, v12, v7 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v12, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v18, v13, v6 +; GFX9-NEXT: v_mul_lo_u32 v16, v15, v4 +; GFX9-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v13, v[11:12] +; GFX9-NEXT: v_add3_u32 v10, v10, v19, v18 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, v[9:10] +; GFX9-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-NEXT: v_mul_lo_u32 v10, v14, v5 +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[14:15] +; GFX9-NEXT: v_add3_u32 v6, v16, v9, v10 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_urem_i128_vv: +; GFX9-O0: ; %bb.0: ; %_udiv-special-cases +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded 
Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; 
GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], 
s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v15, v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v3, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: s_mov_b32 s9, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: s_mov_b32 s12, s10 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: s_mov_b32 s14, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[12:13] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v2 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v5, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v4 +; GFX9-O0-NEXT: v_min_u32_e64 v15, v5, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: s_mov_b32 s8, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: s_mov_b32 s10, s11 +; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[8:9], v11, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[8:9], v5, v12, s[8:9] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def 
$vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: s_mov_b32 s10, s6 +; GFX9-O0-NEXT: s_mov_b32 s11, s7 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v6, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] +; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v7, 1 +; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: s_mov_b32 s14, s13 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 +; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[5:6], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v2, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-O0-NEXT: 
v_cndmask_b32_e64 v1, v1, v2, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-O0-NEXT: s_branch .LBB3_8 +; GFX9-O0-NEXT: .LBB3_1: ; %Flow +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-O0-NEXT: buffer_load_dword v6, 
off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB3_5 +; GFX9-O0-NEXT: .LBB3_3: ; %Flow2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB3_9 +; GFX9-O0-NEXT: .LBB3_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_or3_b32 v4, v4, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB3_3 +; GFX9-O0-NEXT: .LBB3_5: ; %Flow1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], 
s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB3_4 +; GFX9-O0-NEXT: .LBB3_6: ; %udiv-do-while +; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 +; GFX9-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s5, v[23:24] +; GFX9-O0-NEXT: v_mov_b32_e32 
v4, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v23, v5, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[2:3] +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v29 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[29:30], s5, v[6:7] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 +; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v29 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v25 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1 +; GFX9-O0-NEXT: s_mov_b32 s8, s5 +; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v21, v11, v21 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v20 +; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v23, v11, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v22 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v20 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v19, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, 
vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v20, vcc, v11, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v10, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v21 +; GFX9-O0-NEXT: v_or_b32_e64 v19, v19, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v17, v17, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; 
GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] +; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execnz .LBB3_6 +; GFX9-O0-NEXT: s_branch .LBB3_1 +; GFX9-O0-NEXT: .LBB3_7: ; %udiv-preheader +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[21:22] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: s_mov_b32 s6, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v12, v7 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6 +; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6 +; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s8, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v12, v12, v15, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v12, vcc, v12, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v17, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-O0-NEXT: 
v_addc_co_u32_e32 v14, vcc, v14, v15, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v15, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v13 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte 
Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_branch .LBB3_6 +; GFX9-O0-NEXT: .LBB3_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: s_mov_b32 s4, s7 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s8, s6 +; GFX9-O0-NEXT: s_mov_b32 s9, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f +; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 
v[5:6], v3, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: s_mov_b32 s4, 64 +; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3 +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4 +; GFX9-O0-NEXT: s_mov_b32 s10, 63 +; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5] +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[10:11] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v4, v7, s[4:5] +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; 
GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[1:2], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: 
s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-O0-NEXT: s_branch .LBB3_7 +; GFX9-O0-NEXT: .LBB3_9: ; %udiv-end +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-O0-NEXT: 
v_mul_lo_u32 v5, v6, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], s4, v[13:14] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v7, v3 +; GFX9-O0-NEXT: v_mad_u64_u32 v[13:14], s[6:7], v7, v2, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[17:18], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: s_mov_b32 s5, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v3, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v8 +; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s4, v[11:12] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v15 +; GFX9-O0-NEXT: v_mul_lo_u32 v11, v11, v5 +; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v2, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; 
implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-O0-NEXT: v_add_co_u32_e64 v13, s[6:7], v11, v12 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v8, v6, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-O0-NEXT: 
v_lshlrev_b64 v[15:16], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v8, v7, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v19, v11, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v8 +; GFX9-O0-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v5, v7, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v7, s[6:7], 
v7, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v15, s[6:7], v8, v15, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v8 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff +; GFX9-O0-NEXT: s_mov_b32 s8, s7 +; GFX9-O0-NEXT: v_and_b32_e64 v15, v15, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GFX9-O0-NEXT: v_and_b32_e64 v17, v16, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v5, v6, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v15 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v6 +; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v19, v6, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v5, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v15, s[6:7], v6, v15, s[6:7] +; 
GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15 +; GFX9-O0-NEXT: v_lshrrev_b64 v[17:18], s4, v[5:6] +; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v15, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v7, v8, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 +; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v7, v8 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v8 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v3, v7, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def 
$vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = urem i128 %lhs, %rhs + ret i128 %div +} + +define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) { +; GFX9-LABEL: v_sdiv_i128_v_pow2k: +; GFX9: ; %bb.0: 
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], 31, v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_ashrrev_i64 v[2:3], 33, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_sdiv_i128_v_pow2k: +; GFX9-O0: ; %bb.0: +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_ashrrev_i64 v[4:5], s4, v[4:5] +; GFX9-O0-NEXT: s_mov_b32 s5, 31 +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s5, v[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-O0-NEXT: s_mov_b32 s6, s8 +; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: 
v_add_co_u32_e32 v0, vcc, v0, v5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v5, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: s_mov_b32 s4, 33 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_lshl_or_b32 v0, v2, s5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_ashrrev_i64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: v_alignbit_b32 v1, v1, v2, s4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = sdiv i128 %lhs, 8589934592 + ret i128 %div +} + +define i128 @v_srem_i128_v_pow2k(i128 %lhs) { +; GFX9-LABEL: v_srem_i128_v_pow2k: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], 31, v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v2, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc +; GFX9-NEXT: v_and_b32_e32 v4, -2, v4 +; 
GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, 0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_srem_i128_v_pow2k: +; GFX9-O0: ; %bb.0: +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: v_ashrrev_i64 v[6:7], s4, v[6:7] +; GFX9-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: s_mov_b32 s4, s7 +; GFX9-O0-NEXT: v_add_co_u32_e32 v6, vcc, v5, v4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v2, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v3, v2, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-O0-NEXT: s_mov_b32 s6, -2 +; 
GFX9-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: s_mov_b32 s6, s5 +; GFX9-O0-NEXT: v_and_b32_e64 v4, v4, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_and_b32_e64 v9, v6, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v7 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = srem i128 %lhs, 8589934592 + ret i128 %div +} + +define i128 @v_udiv_i128_v_pow2k(i128 %lhs) { +; GFX9-LABEL: v_udiv_i128_v_pow2k: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v4 +; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_udiv_i128_v_pow2k: +; GFX9-O0: ; %bb.0: +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: s_mov_b32 s4, 33 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: s_mov_b32 s5, 31 +; GFX9-O0-NEXT: v_lshl_or_b32 v0, v4, s5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[1:2] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: v_alignbit_b32 v1, v1, v4, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = udiv i128 %lhs, 8589934592 + ret i128 %div +} + +define i128 @v_urem_i128_v_pow2k(i128 %lhs) { +; GFX9-LABEL: v_urem_i128_v_pow2k: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-O0-LABEL: v_urem_i128_v_pow2k: +; GFX9-O0: ; %bb.0: +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr1 killed $exec +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-O0-NEXT: s_mov_b32 s4, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX9-O0-NEXT: s_mov_b32 s5, s6 +; GFX9-O0-NEXT: s_mov_b32 s6, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-O0-NEXT: v_and_b32_e64 v3, v2, s6 +; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_and_b32_e64 v1, v0, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2] +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] + %div = urem i128 %lhs, 8589934592 + ret i128 %div } + +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX9-SDAG: {{.*}} +; GFX9-SDAG-O0: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll new file mode 100644 index 00000000000000..46e2632e45a190 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -0,0 +1,25 @@ +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s + +; SDAG-ERR: LLVM ERROR: unsupported libcall legalization +; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(s128) = G_SDIV %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: v_sdiv_v2i128_vv) + +define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { + %shl = sdiv <2 x i128> %lhs, %rhs + ret <2 x i128> %shl +} + +define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { + %shl = udiv <2 x i128> %lhs, %rhs + ret <2 x i128> %shl +} + +define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { + %shl = srem <2 x i128> %lhs, %rhs + ret <2 x i128> %shl +} + +define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { + %shl = urem <2 x i128> %lhs, %rhs + ret <2 x i128> %shl +} diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index 380a13ed16128f..47110d94918879 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -55,7 +55,6 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 ; GFX12-NEXT: v_mov_b32_e32 v31, v0 -; GFX12-NEXT: s_mov_b32 s12, ttmp9 ; GFX12-NEXT: s_mov_b64 s[8:9], 0 ; GFX12-NEXT: s_mov_b32 s32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff 
--git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll new file mode 100644 index 00000000000000..f49fec60892cda --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s + +define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_f32_bf8_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 0) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 1) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 2) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 
%a) { +; GFX12-LABEL: test_cvt_f32_fp8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %tmp0, i32 3) + ret float %ret +} + +define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %tmp1, float %y, i32 %old, i1 false) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %tmp1, float %y, i32 %old, i1 true) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void 
@test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_sr_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %tmp1, i32 %r, i32 %old, i32 0) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %tmp1, i32 %r, i32 %old, i32 1) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 
@llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %tmp1, i32 %r, i32 %old, i32 2) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32) +declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32) +declare i32 @llvm.amdgcn.cvt.pk.bf8.f32(float, float, i32, i1) +declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1) +declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32) +declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32) + +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 + +attributes #0 = { nounwind convergent } +attributes #1 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir new file mode 100644 index 00000000000000..d11fb27640ee75 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir @@ -0,0 +1,197 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=gcn-dpp-combine %s -o - | FileCheck -check-prefix=GFX12 %s + +--- +name: test_cvt_f32_bf8_byte0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_bf8_byte0 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_F32_BF8_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_dpp [[DEF]], [[COPY]], 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_dpp]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_BF8_e32 killed %1, implicit $mode, implicit 
$exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... +--- +name: test_cvt_f32_bf8_byte2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_bf8_byte2 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... +--- +name: test_cvt_f32_fp8_byte3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_fp8_byte3 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed %1, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... 
+--- +name: test_cvt_pk_bf8_f32_word0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_pk_bf8_f32_word0 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_PK_BF8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_PK_BF8_F32_e64_dpp [[DEF]], 0, [[COPY4]], 0, [[COPY3]], [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_PK_BF8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_PK_BF8_F32_e64 0, killed %6, 0, %1, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
+--- +name: test_cvt_pk_fp8_f32_word1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_pk_fp8_f32_word1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY4]], [[COPY4]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_PK_FP8_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_PK_FP8_F32_e64 8, killed [[V_MOV_B32_dpp]], 0, [[COPY3]], [[COPY2]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_PK_FP8_F32_e64_]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_PK_FP8_F32_e64 8, killed %6, 0, %1, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
+--- +name: test_cvt_sr_bf8_f32_byte0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_sr_bf8_f32_byte0 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_SR_BF8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_SR_BF8_F32_e64_dpp [[DEF]], 0, [[COPY4]], 0, [[COPY3]], 0, [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_SR_BF8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_SR_BF8_F32_e64 0, killed %6, 0, %1, 0, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
+--- +name: test_cvt_sr_fp8_f32_byte2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_sr_fp8_f32_byte2 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY4]], [[COPY4]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_SR_FP8_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_SR_FP8_F32_e64 8, killed [[V_MOV_B32_dpp]], 0, [[COPY3]], 0, [[COPY2]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_SR_FP8_F32_e64_]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_SR_FP8_F32_e64 8, killed %6, 0, %1, 0, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll index 26d0d702d99db4..17b1fcf865e94e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll @@ -1,4 +1,6 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32) declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32) @@ -9,182 +11,524 @@ declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1) declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32) declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32) -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte0: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0{{$}} define float @test_cvt_f32_bf8_byte0(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte1: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1 define float @test_cvt_f32_bf8_byte1(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte2: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2 define float @test_cvt_f32_bf8_byte2(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte3: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3 define float @test_cvt_f32_bf8_byte3(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3) ret float %ret } 
-; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte0: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0{{$}} define float @test_cvt_f32_fp8_byte0(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte1: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1 define float @test_cvt_f32_fp8_byte1(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte2: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2 define float @test_cvt_f32_fp8_byte2(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; 
GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte3: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3 define float @test_cvt_f32_fp8_byte3(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word0: -; GCN: v_cvt_pk_f32_bf8_e32 v[0:1], v0{{$}} define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word1: -; GCN: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_bf8_word1: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word0: -; GCN: v_cvt_pk_f32_fp8_e32 v[0:1], v0{{$}} define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word1: -; GCN: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1 define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 
0x0 +; GFX12-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word0: -; GCN: v_cvt_pk_bf8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word1: -; GCN: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_bf8_f32_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_bf8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: 
s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word0: -; GCN: v_cvt_pk_fp8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_fp8_f32_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_fp8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word1: -; GCN: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float 
%x, float %y, i32 %old, i1 true) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte0: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte0(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 0) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte1: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 1) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte2: -; GCN: 
v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 2) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte3: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 3) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte0: -; GCN: 
v_cvt_sr_fp8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 0) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte1: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 1) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte2: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 
@test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 2) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte3: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 3) ret i32 %ret } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll new file mode 100644 index 
00000000000000..df5533b6295023 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: not --crash llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX9-SDAG-ERR %s +; RUN: not --crash llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX9-GISEL-ERR %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s + +; GFX9-SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.wave.id +; GFX9-GISEL-ERR: LLVM ERROR: unable to legalize instruction: {{.*}} = G_INTRINSIC intrinsic(@llvm.amdgcn.wave.id) + +define amdgpu_cs void @test_wave_id(ptr addrspace(1) %out) { +; GFX9-LABEL: test_wave_id: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_bfe_u32 s0, ttmp8, 0x50019 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: test_wave_id: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_bfe_u32 s0, ttmp8, 0x50019 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %waveid = call i32 @llvm.amdgcn.wave.id() + store i32 %waveid, ptr addrspace(1) %out + ret void +} + +define amdgpu_gfx 
void @test_wave_id_callable(ptr addrspace(1) %out) { +; GFX9-LABEL: test_wave_id_callable: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_bfe_u32 s34, ttmp8, 0x50019 +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_wave_id_callable: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_bfe_u32 s0, ttmp8, 0x50019 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %waveid = call i32 @llvm.amdgcn.wave.id() + store i32 %waveid, ptr addrspace(1) %out + ret void +} + +declare i32 @llvm.amdgcn.wave.id() diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll new file mode 100644 index 00000000000000..afa914c8375f64 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -0,0 +1,295 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s +; 
RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s + +define amdgpu_kernel void @workgroup_ids_kernel() { +; GFX9-LABEL: workgroup_ids_kernel: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: workgroup_ids_kernel: +; GFX9ARCH-SDAG: ; %bb.0: ; %.entry +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: workgroup_ids_kernel: +; GFX9ARCH-GISEL: ; %bb.0: ; %.entry +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9ARCH-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: workgroup_ids_kernel: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: workgroup_ids_kernel: +; 
GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm +.entry: + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + %idy = call i32 @llvm.amdgcn.workgroup.id.y() + %idz = call i32 @llvm.amdgcn.workgroup.id.z() + %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 + %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 + %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 + call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_kernel void @caller() { +; GFX9-SDAG-LABEL: caller: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-SDAG-NEXT: s_mov_b32 s38, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-SDAG-NEXT: s_add_u32 s36, s36, s7 +; GFX9-SDAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-SDAG-NEXT: s_add_u32 s8, s2, 36 +; GFX9-SDAG-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-SDAG-NEXT: s_getpc_b64 s[2:3] +; GFX9-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 +; GFX9-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x0 +; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-SDAG-NEXT: s_mov_b32 s12, s6 +; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; 
GFX9-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[14:15] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: caller: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-GISEL-NEXT: s_mov_b32 s38, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-GISEL-NEXT: s_add_u32 s36, s36, s7 +; GFX9-GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-GISEL-NEXT: s_add_u32 s8, s2, 36 +; GFX9-GISEL-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-GISEL-NEXT: s_mov_b32 s12, s6 +; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[14:15] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: caller: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s38, -1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s36, s36, s6 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s2, 36 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s3, 0 +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[2:3] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: 
s_load_dwordx2 s[6:7], s[2:3], 0x0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: caller: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s38, -1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s36, s36, s6 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s2, 36 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s3, 0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: caller: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 
+; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-SDAG-NEXT: s_mov_b32 s7, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s6, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: caller: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-GISEL-NEXT: s_mov_b32 s6, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s7, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-GISEL-NEXT: s_endpgm + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + call void @callee(i32 %idx) #0 + ret void +} + +declare void @callee(i32) #0 + +define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) { +; GFX9-LABEL: workgroup_ids_device_func: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-SDAG-LABEL: workgroup_ids_device_func: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v6, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: global_store_dword v[0:1], v6, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-SDAG-NEXT: 
s_lshr_b32 s4, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: global_store_dword v[2:3], v0, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-SDAG-NEXT: global_store_dword v[4:5], v0, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-GISEL-LABEL: workgroup_ids_device_func: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v6, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s5, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[0:1], v6, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[2:3], v0, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[4:5], v0, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: workgroup_ids_device_func: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0 +; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-NEXT: v_mov_b32_e32 v8, s1 +; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_store_b32 v[2:3], v7, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_store_b32 v[4:5], v8, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %id.x = call i32 @llvm.amdgcn.workgroup.id.x() + %id.y = call i32 @llvm.amdgcn.workgroup.id.y() + 
%id.z = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %id.x, ptr addrspace(1) %outx + store volatile i32 %id.y, ptr addrspace(1) %outy + store volatile i32 %id.z, ptr addrspace(1) %outz + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() +declare i32 @llvm.amdgcn.workgroup.id.y() +declare i32 @llvm.amdgcn.workgroup.id.z() +declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) + +attributes #0 = { nounwind "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX9ARCH: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll new file mode 100644 index 00000000000000..cfff0a969da9e7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll @@ -0,0 +1,187 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s + 
+define amdgpu_cs void @_amdgpu_cs_main() { +; GFX9-LABEL: _amdgpu_cs_main: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: _amdgpu_cs_main: +; GFX9ARCH-SDAG: ; %bb.0: ; %.entry +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: _amdgpu_cs_main: +; GFX9ARCH-GISEL: ; %bb.0: ; %.entry +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9ARCH-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: _amdgpu_cs_main: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: _amdgpu_cs_main: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX12-GISEL-NEXT: s_endpgm +.entry: + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + %idy = call i32 @llvm.amdgcn.workgroup.id.y() + %idz = call i32 @llvm.amdgcn.workgroup.id.z() + %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 + %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 + %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 + call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_cs void @caller() { +; GFX9-LABEL: caller: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: caller: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s10, -1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s8, s0 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s9, 0 +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 +; 
GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: caller: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s10, -1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s8, s0 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: caller: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: caller: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + call amdgpu_gfx void @callee(i32 %idx) + ret void +} + +declare amdgpu_gfx void @callee(i32) + +define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) { +; GFX9-LABEL: workgroup_ids_gfx: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-LABEL: workgroup_ids_gfx: +; GFX9ARCH: ; %bb.0: +; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: workgroup_ids_gfx: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %id.x = call i32 @llvm.amdgcn.workgroup.id.x() + %id.y = call i32 @llvm.amdgcn.workgroup.id.y() + %id.z = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %id.x, ptr addrspace(1) %outx + store volatile i32 %id.y, ptr addrspace(1) %outy + store volatile i32 %id.z, ptr addrspace(1) %outz + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() +declare i32 @llvm.amdgcn.workgroup.id.y() +declare i32 @llvm.amdgcn.workgroup.id.z() +declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll deleted file mode 100644 index 495b54758de049..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll +++ /dev/null @@ -1,128 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s - -define amdgpu_cs void @_amdgpu_cs_main() { -; GFX9-SDAG-LABEL: _amdgpu_cs_main: -; GFX9-SDAG: ; %bb.0: ; %.entry -; GFX9-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX9-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: _amdgpu_cs_main: -; GFX9-GISEL: ; %bb.0: ; %.entry -; GFX9-GISEL-NEXT: s_mov_b32 s0, ttmp9 -; GFX9-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX9-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: _amdgpu_cs_main: -; GFX12-SDAG: ; %bb.0: ; %.entry -; 
GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: _amdgpu_cs_main: -; GFX12-GISEL: ; %bb.0: ; %.entry -; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 -; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm -.entry: - %idx = call i32 @llvm.amdgcn.workgroup.id.x() - %idy = call i32 @llvm.amdgcn.workgroup.id.y() - %idz = call i32 @llvm.amdgcn.workgroup.id.z() - %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 - %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 - %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 - call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) - ret void -} - -define amdgpu_cs void @caller() { -; GFX9-SDAG-LABEL: caller: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_getpc_b64 s[8:9] -; GFX9-SDAG-NEXT: s_mov_b32 s8, s0 -; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 -; GFX9-SDAG-NEXT: s_mov_b32 s5, callee@abs32@hi -; GFX9-SDAG-NEXT: s_mov_b32 s4, callee@abs32@lo -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_add_u32 s8, s8, s0 -; GFX9-SDAG-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9] -; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], 
s[10:11] -; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: caller: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_getpc_b64 s[8:9] -; GFX9-GISEL-NEXT: s_mov_b32 s8, s0 -; GFX9-GISEL-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 -; GFX9-GISEL-NEXT: s_mov_b32 s4, callee@abs32@lo -; GFX9-GISEL-NEXT: s_mov_b32 s5, callee@abs32@hi -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_add_u32 s8, s8, s0 -; GFX9-GISEL-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9] -; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: caller: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi -; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo -; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: caller: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo -; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi -; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX12-GISEL-NEXT: s_endpgm - %idx = call i32 @llvm.amdgcn.workgroup.id.x() - call amdgpu_gfx void @callee(i32 %idx) - ret void -} - -declare amdgpu_gfx void @callee(i32) - -declare i32 @llvm.amdgcn.workgroup.id.x() -declare i32 @llvm.amdgcn.workgroup.id.y() -declare i32 @llvm.amdgcn.workgroup.id.z() -declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX12: {{.*}} -; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index a59c0394bebe20..ca7486536cf556 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -582,5 +582,170 @@ entry: ret void } +define amdgpu_kernel void @flat_nontemporal_volatile_load( +; GFX7-LABEL: flat_nontemporal_volatile_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_nontemporal_volatile_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_nontemporal_volatile_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: 
flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_nontemporal_volatile_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_nontemporal_volatile_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_nontemporal_volatile_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_nontemporal_volatile_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_nontemporal_volatile_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_nontemporal_volatile_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm + ptr %in, ptr %out) { +entry: + %val = load volatile i32, ptr %in, align 4, !nontemporal !0 + store i32 %val, ptr %out + ret void +} + !0 = !{i32 1} declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 07a3f85066991b..b57d539ac18f0b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -576,5 +576,163 @@ entry: ret void } +define amdgpu_kernel void @global_nontemporal_volatile_load( +; GFX6-LABEL: global_nontemporal_volatile_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: 
s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_nontemporal_volatile_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_nontemporal_volatile_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_nontemporal_volatile_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_nontemporal_volatile_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, 
s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_nontemporal_volatile_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_nontemporal_volatile_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_nontemporal_volatile_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_nontemporal_volatile_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_nontemporal_volatile_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-WGP-NEXT: s_nop 0 +; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_nontemporal_volatile_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-CU-NEXT: s_nop 0 +; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX12-CU-NEXT: s_endpgm + ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0 + store i32 %val, ptr addrspace(1) %out + ret void +} + !0 = !{i32 1} declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 70c91d26006026..61e0ba8bd968ed 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -615,5 +615,184 @@ entry: ret void } +define amdgpu_kernel void @local_nontemporal_volatile_load( +; GFX6-LABEL: local_nontemporal_volatile_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_nontemporal_volatile_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_nontemporal_volatile_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_nontemporal_volatile_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_nontemporal_volatile_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_nontemporal_volatile_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_nontemporal_volatile_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_nontemporal_volatile_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_clause 0x1 +; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-WGP-NEXT: ds_load_b32 v0, v0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-WGP-NEXT: s_endpgm +; +; 
GFX11-CU-LABEL: local_nontemporal_volatile_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_clause 0x1 +; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-CU-NEXT: ds_load_b32 v0, v0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_nontemporal_volatile_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_clause 0x1 +; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-WGP-NEXT: ds_load_b32 v0, v0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-WGP-NEXT: s_nop 0 +; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_nontemporal_volatile_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_clause 0x1 +; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-CU-NEXT: ds_load_b32 v0, v0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-CU-NEXT: s_nop 0 +; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-CU-NEXT: s_endpgm + ptr addrspace(3) %in, ptr addrspace(1) %out) { +entry: + %val = load volatile i32, ptr addrspace(3) %in, align 4, !nontemporal !0 + store i32 %val, ptr addrspace(1) %out + ret void +} + !0 = !{i32 1} declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 069443791b3e1b..30296a9c3d8963 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -813,5 +813,208 @@ entry: ret void } +define amdgpu_kernel void @private_nontemporal_volatile_load( +; GFX6-LABEL: private_nontemporal_volatile_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_add_u32 s8, s8, s7 +; GFX6-NEXT: s_addc_u32 s9, s9, 0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_nontemporal_volatile_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_add_u32 s8, s8, s7 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_nontemporal_volatile_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7 +; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0 
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_nontemporal_volatile_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_add_u32 s8, s8, s7 +; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_nontemporal_volatile_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3] +; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s4 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: private_nontemporal_volatile_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry 
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, off, s4 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_nontemporal_volatile_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_clause 0x1 +; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 glc dlc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_nontemporal_volatile_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_clause 0x1 +; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 glc dlc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_nontemporal_volatile_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_clause 0x1 +; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-WGP-NEXT: s_nop 0 +; 
GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_nontemporal_volatile_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_clause 0x1 +; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-CU-NEXT: s_nop 0 +; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-CU-NEXT: s_endpgm + ptr addrspace(5) %in, ptr addrspace(1) %out) { +entry: + %val = load volatile i32, ptr addrspace(5) %in, align 4, !nontemporal !0 + store i32 %val, ptr addrspace(1) %out + ret void +} + !0 = !{i32 1} declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll index 15af1f17e230ec..f1e2737b370ef0 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll @@ -84,4 +84,58 @@ entry: ret void } +define amdgpu_kernel void @memset_array_ptr_alloca(ptr %out) { +; CHECK-LABEL: @memset_array_ptr_alloca( +; CHECK-NEXT: store i64 0, ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca [6 x ptr], align 16, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false) + %load = load i64, ptr addrspace(5) %alloca + store i64 %load, ptr %out + ret void +} + +define amdgpu_kernel void @memset_vector_ptr_alloca(ptr %out) { +; CHECK-LABEL: @memset_vector_ptr_alloca( +; CHECK-NEXT: store i64 0, ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca <6 x ptr>, align 16, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false) + %load = load i64, ptr addrspace(5) %alloca + store i64 %load, ptr %out + 
ret void +} + +define amdgpu_kernel void @memset_array_of_array_ptr_alloca(ptr %out) { +; CHECK-LABEL: @memset_array_of_array_ptr_alloca( +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x [3 x ptr]], align 16, addrspace(5) +; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false) +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: store i64 [[LOAD]], ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca [2 x [3 x ptr]], align 16, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false) + %load = load i64, ptr addrspace(5) %alloca + store i64 %load, ptr %out + ret void +} + +define amdgpu_kernel void @memset_array_of_vec_ptr_alloca(ptr %out) { +; CHECK-LABEL: @memset_array_of_vec_ptr_alloca( +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x <3 x ptr>], align 16, addrspace(5) +; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false) +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: store i64 [[LOAD]], ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca [2 x <3 x ptr>], align 16, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false) + %load = load i64, ptr addrspace(5) %alloca + store i64 %load, ptr %out + ret void +} + declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll new file mode 100644 index 00000000000000..fb2af36839b5d2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll @@ -0,0 +1,499 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define 
amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: 
global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void 
@test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = 
call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x half> %C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; 
GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + 
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], 
off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, 
<8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 
v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %fneg.fabs.C = fneg <8 x float> %fabs.C + %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %fneg.fabs.C = fneg <8 x half> %fabs.C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <8 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C) + 
store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: flat_load_b128 v[12:15], 
v[8:9] offset:16 +; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-NEXT: v_perm_b32 v15, v15, v14, 0x5040100 +; GFX12-NEXT: v_perm_b32 v14, v13, v12, 0x5040100 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_perm_b32 v13, v19, v18, 0x5040100 +; GFX12-NEXT: v_perm_b32 v12, v17, v16, 0x5040100 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <16 x half>, ptr %Caddr + %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> + %fneg.C_shuffle = fneg <8 x half> %C_shuffle + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +declare <8 x half> @llvm.fabs.v8f16(<8 x half>) +declare <8 x float> @llvm.fabs.v8f32(<8 x float>) +declare float @llvm.fabs.f32(float) + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 
x half>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll new file mode 100644 index 00000000000000..51f93b57f38e90 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll @@ -0,0 +1,431 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10 +; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10 +; GFX12-NEXT: v_mov_b32_e32 v17, v10 +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 
v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10 +; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10 +; GFX12-NEXT: v_mov_b32_e32 v17, v10 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = 
call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_mov_b32_e32 v13, v10 +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: 
s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_mov_b32_e32 v13, v10 +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; 
GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: 
v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v11, v4 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: 
v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], 
v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) 
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll new file mode 100644 index 00000000000000..48297932aa5984 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll @@ -0,0 +1,309 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) 
%out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 
%B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: 
v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x 
i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], 
off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll new file mode 100644 index 00000000000000..43538768f00fd9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll @@ -0,0 +1,321 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void 
@test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; 
GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0) + store <8 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1) + store <8 x half> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 
index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0) + store <8 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1) + store <8 x i16> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v11, v[11:12], off +; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9 +; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16 +; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + 
%res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x 
float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr 
addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) 
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll new file mode 100644 index 00000000000000..2db3b07de54d0a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll @@ -0,0 +1,370 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off 
offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 
v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; 
GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr 
addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, 
<8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> 
@llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 
v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, 
<4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) 
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll new file mode 100644 index 00000000000000..1f2ffaf7627125 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll @@ -0,0 +1,456 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 
0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] 
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb 
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x half> %C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { 
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* 
%out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x 
float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void 
+} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x 
half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %fneg.fabs.C = fneg <4 x float> %fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %fneg.fabs.C = fneg <4 x half> %fabs.C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: 
test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <4 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: 
s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX12-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <8 x half>, ptr %Caddr + %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> + %fneg.C_shuffle = fneg <4 x half> %C_shuffle + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +declare <4 x half> @llvm.fabs.v4f16(<4 x half>) +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) +declare float @llvm.fabs.f32(float) + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>) 
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll new file mode 100644 index 00000000000000..fa0a7c98cea323 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll @@ -0,0 +1,373 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: 
v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v9, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v9, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define 
amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, v6 +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] +; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr 
addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x 
i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll new file mode 100644 index 00000000000000..803453e45ced05 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll @@ -0,0 +1,274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: 
; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; 
GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: 
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 
x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll new file mode 100644 index 00000000000000..0c5b19a8416fb8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll @@ -0,0 +1,472 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; 
GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 
%Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: 
global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0) + store <4 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1) + store <4 x half> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2) + store <4 x half> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3) + store <4 x half> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1 
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0) + store <4 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1) + store <4 x i16> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2) + store <4 x i16> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3) + store <4 x i16> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 
+; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0) + store <4 x i32> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + 
%res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0) + store <4 x i32> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v6, v[6:7], off +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: v_mov_b32_e32 v12, v2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: 
v_mov_b32_e32 v16, v6 +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off +; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr 
addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr 
addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 
v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x 
half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll new file mode 100644 index 00000000000000..b25f2785133310 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x 
float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0) + store <4 x i16> %res, ptr addrspace(1) %out 
+ ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 
v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> 
@llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> 
%B, <4 x half> %C, i8 %Index) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr 
addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x 
float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir new 
file mode 100644 index 00000000000000..33c32d7290da41 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir @@ -0,0 +1,354 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s + +# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0. +# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0 +# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1 + +--- +name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = 
V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec +... + +--- +name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = 
V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... + +--- +name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... 
+--- +name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = 
V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + 
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed 
$vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = 
V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + + ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = 
V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + + ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed 
$vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec + 
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed 
$vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; 
GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed 
$vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed 
$vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec + early-clobber 
renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir new file mode 100644 index 00000000000000..ab89feb861b5e3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir @@ -0,0 +1,355 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s + +# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0. 
+# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0 +# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1 + +--- +name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + + ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + + ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec +... 
diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index 769e6b0964abdb..40e4692a18ec79 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -5,43 +5,25 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { -; GFX9-SDAG-LABEL: workgroup_id_x: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm ; -; GFX9-GISEL-LABEL: workgroup_id_x: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: workgroup_id_x: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX9-LABEL: workgroup_id_x: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: workgroup_id_x: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, ttmp9 :: 
v_dual_mov_b32 v1, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: workgroup_id_x: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %idx, ptr addrspace(1) %ptrx @@ -52,23 +34,23 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace ; GFX9-LABEL: workgroup_id_xy: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, ttmp7 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xy: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 -; GFX12-NEXT: v_mov_b32_e32 v2, ttmp7 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, ttmp7 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: global_store_b32 v0, v2, s[2:3] +; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -81,37 +63,21 @@ define 
amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace } define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry, ptr addrspace(1) %ptrz) { -; GFX9-SDAG-LABEL: workgroup_id_xyz: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[2:3] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[6:7] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: workgroup_id_xyz: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-GISEL-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[6:7] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: workgroup_id_xyz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_and_b32 s6, ttmp7, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xyz: ; GFX12: ; %bb.0: @@ -119,15 +85,15 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX12-NEXT: s_and_b32 s2, ttmp7, 0xffff -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_lshr_b32 s3, ttmp7, 16 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX12-NEXT: global_store_b32 v0, v2, s[6:7] -; GFX12-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: global_store_b32 v1, v2, s[6:7] +; GFX12-NEXT: global_store_b32 v1, v3, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -144,3 +110,8 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac declare i32 @llvm.amdgcn.workgroup.id.x() declare i32 @llvm.amdgcn.workgroup.id.y() declare i32 @llvm.amdgcn.workgroup.id.z() +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll b/llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll new file mode 100644 index 00000000000000..9494880f990258 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple thumbv7a-none-eabi < %s | FileCheck %s + +@val0 = global i32 0, align 4 +@val1 = global i32 0, align 4 +@val2 = global i32 0, align 4 + +define i32 @foo(ptr %ctx) { +; CHECK-LABEL: foo: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cbz r0, .LBB0_2 +; CHECK-NEXT: @ %bb.1: @ %if.end +; CHECK-NEXT: movw r12, :lower16:val2 +; CHECK-NEXT: movw r3, :lower16:val1 +; CHECK-NEXT: movw r2, :lower16:val0 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movt r12, :upper16:val2 +; CHECK-NEXT: movt r3, :upper16:val1 +; CHECK-NEXT: movt r2, :upper16:val0 +; CHECK-NEXT: str r2, [r1, #4] +; CHECK-NEXT: str r3, [r1, #8] +; CHECK-NEXT: str.w r12, [r1, #12] +; CHECK-NEXT: str r0, [r1, #16] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .LBB0_2: @ %if.then +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bl bar +; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: pop {r7, pc} +entry: + %tobool.not = icmp eq ptr %ctx, null + br i1 %tobool.not, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @bar() #2 + br label %return + +if.end: ; preds = %entry + %cmd_a = getelementptr inbounds i8, ptr %ctx, i32 4 + store ptr @val0, ptr %cmd_a, align 4 + %cmd_b = getelementptr inbounds i8, ptr %ctx, i32 8 + store ptr @val1, ptr %cmd_b, align 4 + %cmd_c = getelementptr inbounds i8, ptr %ctx, i32 12 + store ptr @val2, ptr %cmd_c, align 4 + %cmd_d = getelementptr inbounds i8, ptr %ctx, i32 16 + store ptr null, ptr %cmd_d, align 4 + br label %return + +return: ; preds = 
%if.end, %if.then + %retval.0 = phi i32 [ 0, %if.end ], [ -1, %if.then ] + ret i32 %retval.0 +} + +declare void @bar() diff --git a/llvm/test/CodeGen/AVR/bug-81911.ll b/llvm/test/CodeGen/AVR/bug-81911.ll new file mode 100644 index 00000000000000..2a22666a1ff927 --- /dev/null +++ b/llvm/test/CodeGen/AVR/bug-81911.ll @@ -0,0 +1,163 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=avr -mcpu=atmega328 -O1 -verify-machineinstrs | FileCheck %s + +define internal i8 @main() { +; CHECK-LABEL: main: +; CHECK: ; %bb.0: ; %bb0 +; CHECK-NEXT: push r2 +; CHECK-NEXT: push r3 +; CHECK-NEXT: push r4 +; CHECK-NEXT: push r5 +; CHECK-NEXT: push r6 +; CHECK-NEXT: push r7 +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: push r11 +; CHECK-NEXT: push r12 +; CHECK-NEXT: push r13 +; CHECK-NEXT: push r14 +; CHECK-NEXT: push r15 +; CHECK-NEXT: push r16 +; CHECK-NEXT: push r17 +; CHECK-NEXT: push r28 +; CHECK-NEXT: push r29 +; CHECK-NEXT: in r28, 61 +; CHECK-NEXT: in r29, 62 +; CHECK-NEXT: sbiw r28, 13 +; CHECK-NEXT: in r0, 63 +; CHECK-NEXT: cli +; CHECK-NEXT: out 62, r29 +; CHECK-NEXT: out 63, r0 +; CHECK-NEXT: out 61, r28 +; CHECK-NEXT: ldi r16, 0 +; CHECK-NEXT: ldi r17, 0 +; CHECK-NEXT: ldi r18, -1 +; CHECK-NEXT: ;APP +; CHECK-NEXT: ldi r24, 123 +; CHECK-NEXT: ;NO_APP +; CHECK-NEXT: std Y+1, r24 ; 1-byte Folded Spill +; CHECK-NEXT: movw r24, r28 +; CHECK-NEXT: adiw r24, 6 +; CHECK-NEXT: std Y+3, r25 ; 2-byte Folded Spill +; CHECK-NEXT: std Y+2, r24 ; 2-byte Folded Spill +; CHECK-NEXT: movw r8, r16 +; CHECK-NEXT: movw r6, r16 +; CHECK-NEXT: movw r4, r16 +; CHECK-NEXT: movw r2, r16 +; CHECK-NEXT: rjmp .LBB0_2 +; CHECK-NEXT: .LBB0_1: ; %bb1 +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: andi r30, 1 +; CHECK-NEXT: ldd r31, Y+4 ; 1-byte Folded Reload +; CHECK-NEXT: dec r31 +; CHECK-NEXT: cpi r30, 0 +; CHECK-NEXT: movw r8, r18 +; CHECK-NEXT: movw r6, r20 +; 
CHECK-NEXT: movw r4, r22 +; CHECK-NEXT: movw r2, r24 +; CHECK-NEXT: mov r18, r31 +; CHECK-NEXT: brne .LBB0_2 +; CHECK-NEXT: rjmp .LBB0_4 +; CHECK-NEXT: .LBB0_2: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: std Y+4, r18 ; 1-byte Folded Spill +; CHECK-NEXT: movw r18, r8 +; CHECK-NEXT: movw r20, r6 +; CHECK-NEXT: movw r22, r4 +; CHECK-NEXT: movw r24, r2 +; CHECK-NEXT: ldi r26, 10 +; CHECK-NEXT: ldi r27, 0 +; CHECK-NEXT: movw r10, r26 +; CHECK-NEXT: movw r12, r16 +; CHECK-NEXT: movw r14, r16 +; CHECK-NEXT: call __udivdi3 +; CHECK-NEXT: std Y+13, r25 +; CHECK-NEXT: std Y+12, r24 +; CHECK-NEXT: std Y+11, r23 +; CHECK-NEXT: std Y+10, r22 +; CHECK-NEXT: std Y+9, r21 +; CHECK-NEXT: std Y+8, r20 +; CHECK-NEXT: std Y+7, r19 +; CHECK-NEXT: std Y+6, r18 +; CHECK-NEXT: ldd r30, Y+2 ; 2-byte Folded Reload +; CHECK-NEXT: ldd r31, Y+3 ; 2-byte Folded Reload +; CHECK-NEXT: ;APP +; CHECK-NEXT: ;NO_APP +; CHECK-NEXT: ldi r30, 1 +; CHECK-NEXT: cp r8, r1 +; CHECK-NEXT: cpc r9, r1 +; CHECK-NEXT: cpc r6, r16 +; CHECK-NEXT: cpc r7, r17 +; CHECK-NEXT: cpc r4, r16 +; CHECK-NEXT: cpc r5, r17 +; CHECK-NEXT: cpc r2, r16 +; CHECK-NEXT: cpc r3, r17 +; CHECK-NEXT: breq .LBB0_3 +; CHECK-NEXT: rjmp .LBB0_1 +; CHECK-NEXT: .LBB0_3: ; %bb1 +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: mov r30, r1 +; CHECK-NEXT: rjmp .LBB0_1 +; CHECK-NEXT: .LBB0_4: ; %bb3 +; CHECK-NEXT: ldd r24, Y+1 ; 1-byte Folded Reload +; CHECK-NEXT: std Y+5, r24 +; CHECK-NEXT: movw r24, r28 +; CHECK-NEXT: adiw r24, 5 +; CHECK-NEXT: ;APP +; CHECK-NEXT: ;NO_APP +; CHECK-NEXT: ldd r24, Y+5 +; CHECK-NEXT: adiw r28, 13 +; CHECK-NEXT: in r0, 63 +; CHECK-NEXT: cli +; CHECK-NEXT: out 62, r29 +; CHECK-NEXT: out 63, r0 +; CHECK-NEXT: out 61, r28 +; CHECK-NEXT: pop r29 +; CHECK-NEXT: pop r28 +; CHECK-NEXT: pop r17 +; CHECK-NEXT: pop r16 +; CHECK-NEXT: pop r15 +; CHECK-NEXT: pop r14 +; CHECK-NEXT: pop r13 +; CHECK-NEXT: pop r12 +; CHECK-NEXT: pop r11 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; 
CHECK-NEXT: pop r8 +; CHECK-NEXT: pop r7 +; CHECK-NEXT: pop r6 +; CHECK-NEXT: pop r5 +; CHECK-NEXT: pop r4 +; CHECK-NEXT: pop r3 +; CHECK-NEXT: pop r2 +; CHECK-NEXT: ret +bb0: + %0 = alloca i64 + %1 = alloca i8 + %2 = tail call i8 asm sideeffect "ldi ${0}, 123", "=&r,~{sreg},~{memory}"() + + br label %bb1 + +bb1: + %3 = phi i64 [ %5, %bb1 ], [ 0, %bb0 ] + %4 = phi i8 [ %6, %bb1 ], [ 0, %bb0 ] + %5 = udiv i64 %3, 10 + %6 = add i8 %4, 1 + + store i64 %5, ptr %0 + call void asm sideeffect "", "r,~{memory}"(ptr %0) + + %7 = icmp eq i64 %3, 0 + %8 = icmp eq i8 %6, 0 + + br i1 %7, label %bb3, label %bb1 + +bb3: + store i8 %2, ptr %1 + call void asm sideeffect "", "r,~{memory}"(ptr %1) + + %9 = load i8, ptr %1 + + ret i8 %9 +} diff --git a/llvm/test/CodeGen/LoongArch/addrspacecast.ll b/llvm/test/CodeGen/LoongArch/addrspacecast.ll new file mode 100644 index 00000000000000..7875562331be09 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/addrspacecast.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --verify-machineinstrs < %s | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --verify-machineinstrs < %s | FileCheck %s --check-prefix=LA64 + +define void @cast0(ptr addrspace(1) %ptr) { +; LA32-LABEL: cast0: +; LA32: # %bb.0: +; LA32-NEXT: st.w $zero, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: cast0: +; LA64: # %bb.0: +; LA64-NEXT: st.w $zero, $a0, 0 +; LA64-NEXT: ret + %ptr0 = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(0) + store i32 0, ptr %ptr0 + ret void +} + +define void @cast1(ptr %ptr) { +; LA32-LABEL: cast1: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: .cfi_def_cfa_offset 16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: bl %plt(foo) +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: cast1: +; LA64: # %bb.0: +; 
LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: .cfi_def_cfa_offset 16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: bl %plt(foo) +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret + %castptr = addrspacecast ptr %ptr to ptr addrspace(10) + call void @foo(ptr addrspace(10) %castptr) + ret void +} + +declare void @foo(ptr addrspace(10)) diff --git a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll index b0f29ee790885d..b84c1093eb75f2 100644 --- a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll @@ -25,15 +25,16 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { ; LA64-NEXT: andi $a5, $a5, 255 ; LA64-NEXT: sll.w $a5, $a5, $a3 ; LA64-NEXT: and $a6, $a2, $a4 -; LA64-NEXT: or $a6, $a6, $a5 +; LA64-NEXT: or $a5, $a6, $a5 +; LA64-NEXT: addi.w $a6, $a2, 0 ; LA64-NEXT: .LBB0_3: # %atomicrmw.start ; LA64-NEXT: # Parent Loop BB0_1 Depth=1 ; LA64-NEXT: # => This Inner Loop Header: Depth=2 -; LA64-NEXT: ll.w $a5, $a0, 0 -; LA64-NEXT: bne $a5, $a2, .LBB0_5 +; LA64-NEXT: ll.w $a2, $a0, 0 +; LA64-NEXT: bne $a2, $a6, .LBB0_5 ; LA64-NEXT: # %bb.4: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB0_3 Depth=2 -; LA64-NEXT: move $a7, $a6 +; LA64-NEXT: move $a7, $a5 ; LA64-NEXT: sc.w $a7, $a0, 0 ; LA64-NEXT: beqz $a7, .LBB0_3 ; LA64-NEXT: b .LBB0_6 @@ -42,11 +43,9 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { ; LA64-NEXT: dbar 20 ; LA64-NEXT: .LBB0_6: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB0_1 Depth=1 -; LA64-NEXT: addi.w $a6, $a2, 0 -; LA64-NEXT: move $a2, $a5 -; LA64-NEXT: bne $a5, $a6, .LBB0_1 +; LA64-NEXT: bne $a2, $a6, .LBB0_1 ; LA64-NEXT: # %bb.2: # %atomicrmw.end -; LA64-NEXT: srl.w $a0, $a5, $a3 +; LA64-NEXT: srl.w $a0, $a2, $a3 ; LA64-NEXT: ret %result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst ret i8 
%result @@ -77,15 +76,16 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; LA64-NEXT: bstrpick.d $a5, $a5, 15, 0 ; LA64-NEXT: sll.w $a5, $a5, $a3 ; LA64-NEXT: and $a6, $a2, $a4 -; LA64-NEXT: or $a6, $a6, $a5 +; LA64-NEXT: or $a5, $a6, $a5 +; LA64-NEXT: addi.w $a6, $a2, 0 ; LA64-NEXT: .LBB1_3: # %atomicrmw.start ; LA64-NEXT: # Parent Loop BB1_1 Depth=1 ; LA64-NEXT: # => This Inner Loop Header: Depth=2 -; LA64-NEXT: ll.w $a5, $a0, 0 -; LA64-NEXT: bne $a5, $a2, .LBB1_5 +; LA64-NEXT: ll.w $a2, $a0, 0 +; LA64-NEXT: bne $a2, $a6, .LBB1_5 ; LA64-NEXT: # %bb.4: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB1_3 Depth=2 -; LA64-NEXT: move $a7, $a6 +; LA64-NEXT: move $a7, $a5 ; LA64-NEXT: sc.w $a7, $a0, 0 ; LA64-NEXT: beqz $a7, .LBB1_3 ; LA64-NEXT: b .LBB1_6 @@ -94,11 +94,9 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; LA64-NEXT: dbar 20 ; LA64-NEXT: .LBB1_6: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB1_1 Depth=1 -; LA64-NEXT: addi.w $a6, $a2, 0 -; LA64-NEXT: move $a2, $a5 -; LA64-NEXT: bne $a5, $a6, .LBB1_1 +; LA64-NEXT: bne $a2, $a6, .LBB1_1 ; LA64-NEXT: # %bb.2: # %atomicrmw.end -; LA64-NEXT: srl.w $a0, $a5, $a3 +; LA64-NEXT: srl.w $a0, $a2, $a3 ; LA64-NEXT: ret %result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst ret i16 %result @@ -107,37 +105,36 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) { ; LA64-LABEL: atomicrmw_uinc_wrap_i32: ; LA64: # %bb.0: -; LA64-NEXT: ld.w $a3, $a0, 0 -; LA64-NEXT: addi.w $a2, $a1, 0 +; LA64-NEXT: ld.w $a2, $a0, 0 +; LA64-NEXT: addi.w $a1, $a1, 0 ; LA64-NEXT: .p2align 4, , 16 ; LA64-NEXT: .LBB2_1: # %atomicrmw.start ; LA64-NEXT: # =>This Loop Header: Depth=1 ; LA64-NEXT: # Child Loop BB2_3 Depth 2 -; LA64-NEXT: addi.w $a4, $a3, 0 -; LA64-NEXT: sltu $a1, $a4, $a2 -; LA64-NEXT: xori $a1, $a1, 1 -; LA64-NEXT: addi.d $a5, $a3, 1 -; LA64-NEXT: masknez $a5, $a5, $a1 +; LA64-NEXT: addi.w $a3, $a2, 0 +; LA64-NEXT: sltu $a4, $a3, $a1 
+; LA64-NEXT: xori $a4, $a4, 1 +; LA64-NEXT: addi.d $a2, $a2, 1 +; LA64-NEXT: masknez $a4, $a2, $a4 ; LA64-NEXT: .LBB2_3: # %atomicrmw.start ; LA64-NEXT: # Parent Loop BB2_1 Depth=1 ; LA64-NEXT: # => This Inner Loop Header: Depth=2 -; LA64-NEXT: ll.w $a1, $a0, 0 -; LA64-NEXT: bne $a1, $a3, .LBB2_5 +; LA64-NEXT: ll.w $a2, $a0, 0 +; LA64-NEXT: bne $a2, $a3, .LBB2_5 ; LA64-NEXT: # %bb.4: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB2_3 Depth=2 -; LA64-NEXT: move $a6, $a5 -; LA64-NEXT: sc.w $a6, $a0, 0 -; LA64-NEXT: beqz $a6, .LBB2_3 +; LA64-NEXT: move $a5, $a4 +; LA64-NEXT: sc.w $a5, $a0, 0 +; LA64-NEXT: beqz $a5, .LBB2_3 ; LA64-NEXT: b .LBB2_6 ; LA64-NEXT: .LBB2_5: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB2_1 Depth=1 ; LA64-NEXT: dbar 20 ; LA64-NEXT: .LBB2_6: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB2_1 Depth=1 -; LA64-NEXT: move $a3, $a1 -; LA64-NEXT: bne $a1, $a4, .LBB2_1 +; LA64-NEXT: bne $a2, $a3, .LBB2_1 ; LA64-NEXT: # %bb.2: # %atomicrmw.end -; LA64-NEXT: move $a0, $a1 +; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %result = atomicrmw uinc_wrap ptr %ptr, i32 %val seq_cst ret i32 %result @@ -209,15 +206,16 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { ; LA64-NEXT: andi $a6, $a6, 255 ; LA64-NEXT: sll.w $a6, $a6, $a3 ; LA64-NEXT: and $a7, $a2, $a4 -; LA64-NEXT: or $a7, $a7, $a6 +; LA64-NEXT: or $a6, $a7, $a6 +; LA64-NEXT: addi.w $a7, $a2, 0 ; LA64-NEXT: .LBB4_3: # %atomicrmw.start ; LA64-NEXT: # Parent Loop BB4_1 Depth=1 ; LA64-NEXT: # => This Inner Loop Header: Depth=2 -; LA64-NEXT: ll.w $a6, $a0, 0 -; LA64-NEXT: bne $a6, $a2, .LBB4_5 +; LA64-NEXT: ll.w $a2, $a0, 0 +; LA64-NEXT: bne $a2, $a7, .LBB4_5 ; LA64-NEXT: # %bb.4: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB4_3 Depth=2 -; LA64-NEXT: move $t0, $a7 +; LA64-NEXT: move $t0, $a6 ; LA64-NEXT: sc.w $t0, $a0, 0 ; LA64-NEXT: beqz $t0, .LBB4_3 ; LA64-NEXT: b .LBB4_6 @@ -226,11 +224,9 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { ; LA64-NEXT: dbar 20 ; 
LA64-NEXT: .LBB4_6: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB4_1 Depth=1 -; LA64-NEXT: addi.w $a7, $a2, 0 -; LA64-NEXT: move $a2, $a6 -; LA64-NEXT: bne $a6, $a7, .LBB4_1 +; LA64-NEXT: bne $a2, $a7, .LBB4_1 ; LA64-NEXT: # %bb.2: # %atomicrmw.end -; LA64-NEXT: srl.w $a0, $a6, $a3 +; LA64-NEXT: srl.w $a0, $a2, $a3 ; LA64-NEXT: ret %result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst ret i8 %result @@ -266,15 +262,16 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; LA64-NEXT: bstrpick.d $a6, $a6, 15, 0 ; LA64-NEXT: sll.w $a6, $a6, $a3 ; LA64-NEXT: and $a7, $a2, $a4 -; LA64-NEXT: or $a7, $a7, $a6 +; LA64-NEXT: or $a6, $a7, $a6 +; LA64-NEXT: addi.w $a7, $a2, 0 ; LA64-NEXT: .LBB5_3: # %atomicrmw.start ; LA64-NEXT: # Parent Loop BB5_1 Depth=1 ; LA64-NEXT: # => This Inner Loop Header: Depth=2 -; LA64-NEXT: ll.w $a6, $a0, 0 -; LA64-NEXT: bne $a6, $a2, .LBB5_5 +; LA64-NEXT: ll.w $a2, $a0, 0 +; LA64-NEXT: bne $a2, $a7, .LBB5_5 ; LA64-NEXT: # %bb.4: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB5_3 Depth=2 -; LA64-NEXT: move $t0, $a7 +; LA64-NEXT: move $t0, $a6 ; LA64-NEXT: sc.w $t0, $a0, 0 ; LA64-NEXT: beqz $t0, .LBB5_3 ; LA64-NEXT: b .LBB5_6 @@ -283,11 +280,9 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; LA64-NEXT: dbar 20 ; LA64-NEXT: .LBB5_6: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB5_1 Depth=1 -; LA64-NEXT: addi.w $a7, $a2, 0 -; LA64-NEXT: move $a2, $a6 -; LA64-NEXT: bne $a6, $a7, .LBB5_1 +; LA64-NEXT: bne $a2, $a7, .LBB5_1 ; LA64-NEXT: # %bb.2: # %atomicrmw.end -; LA64-NEXT: srl.w $a0, $a6, $a3 +; LA64-NEXT: srl.w $a0, $a2, $a3 ; LA64-NEXT: ret %result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst ret i16 %result @@ -296,22 +291,22 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { ; LA64-LABEL: atomicrmw_udec_wrap_i32: ; LA64: # %bb.0: -; LA64-NEXT: ld.w $a4, $a0, 0 +; LA64-NEXT: ld.w $a2, $a0, 0 ; LA64-NEXT: addi.w $a3, $a1, 0 ; LA64-NEXT: 
.p2align 4, , 16 ; LA64-NEXT: .LBB6_1: # %atomicrmw.start ; LA64-NEXT: # =>This Loop Header: Depth=1 ; LA64-NEXT: # Child Loop BB6_3 Depth 2 -; LA64-NEXT: addi.w $a5, $a4, 0 -; LA64-NEXT: sltu $a2, $a3, $a5 -; LA64-NEXT: addi.d $a6, $a4, -1 -; LA64-NEXT: masknez $a6, $a6, $a2 -; LA64-NEXT: maskeqz $a2, $a1, $a2 -; LA64-NEXT: or $a2, $a2, $a6 -; LA64-NEXT: sltui $a6, $a5, 1 -; LA64-NEXT: masknez $a2, $a2, $a6 -; LA64-NEXT: maskeqz $a6, $a1, $a6 -; LA64-NEXT: or $a6, $a6, $a2 +; LA64-NEXT: addi.w $a4, $a2, 0 +; LA64-NEXT: sltu $a5, $a3, $a4 +; LA64-NEXT: addi.d $a2, $a2, -1 +; LA64-NEXT: masknez $a2, $a2, $a5 +; LA64-NEXT: maskeqz $a5, $a1, $a5 +; LA64-NEXT: or $a2, $a5, $a2 +; LA64-NEXT: sltui $a5, $a4, 1 +; LA64-NEXT: masknez $a2, $a2, $a5 +; LA64-NEXT: maskeqz $a5, $a1, $a5 +; LA64-NEXT: or $a5, $a5, $a2 ; LA64-NEXT: .LBB6_3: # %atomicrmw.start ; LA64-NEXT: # Parent Loop BB6_1 Depth=1 ; LA64-NEXT: # => This Inner Loop Header: Depth=2 @@ -319,17 +314,16 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { ; LA64-NEXT: bne $a2, $a4, .LBB6_5 ; LA64-NEXT: # %bb.4: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB6_3 Depth=2 -; LA64-NEXT: move $a7, $a6 -; LA64-NEXT: sc.w $a7, $a0, 0 -; LA64-NEXT: beqz $a7, .LBB6_3 +; LA64-NEXT: move $a6, $a5 +; LA64-NEXT: sc.w $a6, $a0, 0 +; LA64-NEXT: beqz $a6, .LBB6_3 ; LA64-NEXT: b .LBB6_6 ; LA64-NEXT: .LBB6_5: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB6_1 Depth=1 ; LA64-NEXT: dbar 20 ; LA64-NEXT: .LBB6_6: # %atomicrmw.start ; LA64-NEXT: # in Loop: Header=BB6_1 Depth=1 -; LA64-NEXT: move $a4, $a2 -; LA64-NEXT: bne $a2, $a5, .LBB6_1 +; LA64-NEXT: bne $a2, $a4, .LBB6_1 ; LA64-NEXT: # %bb.2: # %atomicrmw.end ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret diff --git a/llvm/test/CodeGen/LoongArch/bstrins_w.ll b/llvm/test/CodeGen/LoongArch/bstrins_w.ll index dfbe000841cdcb..e008caacad2a17 100644 --- a/llvm/test/CodeGen/LoongArch/bstrins_w.ll +++ b/llvm/test/CodeGen/LoongArch/bstrins_w.ll @@ -145,6 +145,19 @@ define i32 
@pat5(i32 %a) nounwind { ret i32 %or } +;; The high bits of `const` are zero. +define i32 @pat5_high_zeros(i32 %a) nounwind { +; CHECK-LABEL: pat5_high_zeros: +; CHECK: # %bb.0: +; CHECK-NEXT: lu12i.w $a1, 1 +; CHECK-NEXT: ori $a1, $a1, 564 +; CHECK-NEXT: bstrins.w $a0, $a1, 31, 16 +; CHECK-NEXT: ret + %and = and i32 %a, 65535 ; 0x0000ffff + %or = or i32 %and, 305397760 ; 0x12340000 + ret i32 %or +} + ;; Pattern 6: a = b | ((c & mask) << shamt) ;; In this testcase b is 0x10000002, but in fact we do not require b being a ;; constant. As long as all positions in b to be overwritten by the incoming diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll index 417c865f6383ff..31ecec6ea8051b 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll @@ -69,6 +69,7 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind define void @cmpxchg_i32_acquire_acquire(ptr %ptr, i32 %cmp, i32 %val) nounwind { ; LA64-LABEL: cmpxchg_i32_acquire_acquire: ; LA64: # %bb.0: +; LA64-NEXT: addi.w $a1, $a1, 0 ; LA64-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; LA64-NEXT: ll.w $a3, $a0, 0 ; LA64-NEXT: bne $a3, $a1, .LBB2_3 @@ -172,6 +173,7 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin define void @cmpxchg_i32_acquire_monotonic(ptr %ptr, i32 %cmp, i32 %val) nounwind { ; LA64-LABEL: cmpxchg_i32_acquire_monotonic: ; LA64: # %bb.0: +; LA64-NEXT: addi.w $a1, $a1, 0 ; LA64-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 ; LA64-NEXT: ll.w $a3, $a0, 0 ; LA64-NEXT: bne $a3, $a1, .LBB6_3 @@ -279,9 +281,10 @@ define i16 @cmpxchg_i16_acquire_acquire_reti16(ptr %ptr, i16 %cmp, i16 %val) nou define i32 @cmpxchg_i32_acquire_acquire_reti32(ptr %ptr, i32 %cmp, i32 %val) nounwind { ; LA64-LABEL: cmpxchg_i32_acquire_acquire_reti32: ; LA64: # %bb.0: +; LA64-NEXT: addi.w $a3, 
$a1, 0 ; LA64-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 -; LA64-NEXT: ll.w $a3, $a0, 0 -; LA64-NEXT: bne $a3, $a1, .LBB10_3 +; LA64-NEXT: ll.w $a1, $a0, 0 +; LA64-NEXT: bne $a1, $a3, .LBB10_3 ; LA64-NEXT: # %bb.2: # in Loop: Header=BB10_1 Depth=1 ; LA64-NEXT: move $a4, $a2 ; LA64-NEXT: sc.w $a4, $a0, 0 @@ -290,7 +293,7 @@ define i32 @cmpxchg_i32_acquire_acquire_reti32(ptr %ptr, i32 %cmp, i32 %val) nou ; LA64-NEXT: .LBB10_3: ; LA64-NEXT: dbar 20 ; LA64-NEXT: .LBB10_4: -; LA64-NEXT: move $a0, $a3 +; LA64-NEXT: move $a0, $a1 ; LA64-NEXT: ret %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire acquire %res = extractvalue { i32, i1 } %tmp, 0 @@ -396,6 +399,7 @@ define i1 @cmpxchg_i16_acquire_acquire_reti1(ptr %ptr, i16 %cmp, i16 %val) nounw define i1 @cmpxchg_i32_acquire_acquire_reti1(ptr %ptr, i32 %cmp, i32 %val) nounwind { ; LA64-LABEL: cmpxchg_i32_acquire_acquire_reti1: ; LA64: # %bb.0: +; LA64-NEXT: addi.w $a1, $a1, 0 ; LA64-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 ; LA64-NEXT: ll.w $a3, $a0, 0 ; LA64-NEXT: bne $a3, $a1, .LBB14_3 @@ -407,8 +411,7 @@ define i1 @cmpxchg_i32_acquire_acquire_reti1(ptr %ptr, i32 %cmp, i32 %val) nounw ; LA64-NEXT: .LBB14_3: ; LA64-NEXT: dbar 20 ; LA64-NEXT: .LBB14_4: -; LA64-NEXT: addi.w $a0, $a1, 0 -; LA64-NEXT: xor $a0, $a3, $a0 +; LA64-NEXT: xor $a0, $a3, $a1 ; LA64-NEXT: sltui $a0, $a0, 1 ; LA64-NEXT: ret %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire acquire @@ -506,6 +509,7 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw define void @cmpxchg_i32_monotonic_monotonic(ptr %ptr, i32 %cmp, i32 %val) nounwind { ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic: ; LA64: # %bb.0: +; LA64-NEXT: addi.w $a1, $a1, 0 ; LA64-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 ; LA64-NEXT: ll.w $a3, $a0, 0 ; LA64-NEXT: bne $a3, $a1, .LBB18_3 @@ -613,9 +617,10 @@ define i16 @cmpxchg_i16_monotonic_monotonic_reti16(ptr %ptr, i16 %cmp, i16 %val) define i32 
@cmpxchg_i32_monotonic_monotonic_reti32(ptr %ptr, i32 %cmp, i32 %val) nounwind { ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic_reti32: ; LA64: # %bb.0: +; LA64-NEXT: addi.w $a3, $a1, 0 ; LA64-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1 -; LA64-NEXT: ll.w $a3, $a0, 0 -; LA64-NEXT: bne $a3, $a1, .LBB22_3 +; LA64-NEXT: ll.w $a1, $a0, 0 +; LA64-NEXT: bne $a1, $a3, .LBB22_3 ; LA64-NEXT: # %bb.2: # in Loop: Header=BB22_1 Depth=1 ; LA64-NEXT: move $a4, $a2 ; LA64-NEXT: sc.w $a4, $a0, 0 @@ -624,7 +629,7 @@ define i32 @cmpxchg_i32_monotonic_monotonic_reti32(ptr %ptr, i32 %cmp, i32 %val) ; LA64-NEXT: .LBB22_3: ; LA64-NEXT: dbar 1792 ; LA64-NEXT: .LBB22_4: -; LA64-NEXT: move $a0, $a3 +; LA64-NEXT: move $a0, $a1 ; LA64-NEXT: ret %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic %res = extractvalue { i32, i1 } %tmp, 0 @@ -730,6 +735,7 @@ define i1 @cmpxchg_i16_monotonic_monotonic_reti1(ptr %ptr, i16 %cmp, i16 %val) n define i1 @cmpxchg_i32_monotonic_monotonic_reti1(ptr %ptr, i32 %cmp, i32 %val) nounwind { ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic_reti1: ; LA64: # %bb.0: +; LA64-NEXT: addi.w $a1, $a1, 0 ; LA64-NEXT: .LBB26_1: # =>This Inner Loop Header: Depth=1 ; LA64-NEXT: ll.w $a3, $a0, 0 ; LA64-NEXT: bne $a3, $a1, .LBB26_3 @@ -741,8 +747,7 @@ define i1 @cmpxchg_i32_monotonic_monotonic_reti1(ptr %ptr, i32 %cmp, i32 %val) n ; LA64-NEXT: .LBB26_3: ; LA64-NEXT: dbar 1792 ; LA64-NEXT: .LBB26_4: -; LA64-NEXT: addi.w $a0, $a1, 0 -; LA64-NEXT: xor $a0, $a3, $a0 +; LA64-NEXT: xor $a0, $a3, $a1 ; LA64-NEXT: sltui $a0, $a0, 1 ; LA64-NEXT: ret %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll index 589360823b1488..4d8160d7080340 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll @@ -16,6 +16,7 @@ define float @float_fadd_acquire(ptr 
%p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB0_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB0_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -33,8 +34,7 @@ define float @float_fadd_acquire(ptr %p) nounwind { ; LA64F-NEXT: .LBB0_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB0_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB0_1 +; LA64F-NEXT: bne $a3, $a2, .LBB0_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -51,6 +51,7 @@ define float @float_fadd_acquire(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB0_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB0_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -68,8 +69,7 @@ define float @float_fadd_acquire(ptr %p) nounwind { ; LA64D-NEXT: .LBB0_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB0_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB0_1 +; LA64D-NEXT: bne $a3, $a2, .LBB0_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fadd ptr %p, float 1.0 acquire, align 4 @@ -90,6 +90,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB1_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB1_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -107,8 +108,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { ; LA64F-NEXT: .LBB1_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB1_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; 
LA64F-NEXT: bne $a3, $a1, .LBB1_1 +; LA64F-NEXT: bne $a3, $a2, .LBB1_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -125,6 +125,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB1_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB1_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -142,8 +143,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { ; LA64D-NEXT: .LBB1_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB1_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB1_1 +; LA64D-NEXT: bne $a3, $a2, .LBB1_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, float 1.0 acquire, align 4 @@ -165,6 +165,7 @@ define float @float_fmin_acquire(ptr %p) nounwind { ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB2_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB2_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -182,8 +183,7 @@ define float @float_fmin_acquire(ptr %p) nounwind { ; LA64F-NEXT: .LBB2_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB2_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB2_1 +; LA64F-NEXT: bne $a3, $a2, .LBB2_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -201,6 +201,7 @@ define float @float_fmin_acquire(ptr %p) nounwind { ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB2_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB2_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -218,8 +219,7 @@ define float 
@float_fmin_acquire(ptr %p) nounwind { ; LA64D-NEXT: .LBB2_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB2_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB2_1 +; LA64D-NEXT: bne $a3, $a2, .LBB2_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, float 1.0 acquire, align 4 @@ -241,6 +241,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB3_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB3_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -258,8 +259,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { ; LA64F-NEXT: .LBB3_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB3_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB3_1 +; LA64F-NEXT: bne $a3, $a2, .LBB3_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -277,6 +277,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB3_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB3_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -294,8 +295,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { ; LA64D-NEXT: .LBB3_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB3_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB3_1 +; LA64D-NEXT: bne $a3, $a2, .LBB3_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmax ptr %p, float 1.0 acquire, align 4 @@ -694,6 +694,7 @@ define float @float_fadd_release(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, 
$fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB8_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB8_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -711,8 +712,7 @@ define float @float_fadd_release(ptr %p) nounwind { ; LA64F-NEXT: .LBB8_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB8_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB8_1 +; LA64F-NEXT: bne $a3, $a2, .LBB8_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -729,6 +729,7 @@ define float @float_fadd_release(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB8_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB8_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -746,8 +747,7 @@ define float @float_fadd_release(ptr %p) nounwind { ; LA64D-NEXT: .LBB8_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB8_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB8_1 +; LA64D-NEXT: bne $a3, $a2, .LBB8_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fadd ptr %p, float 1.0 release, align 4 @@ -768,6 +768,7 @@ define float @float_fsub_release(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB9_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB9_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -785,8 +786,7 @@ define float @float_fsub_release(ptr %p) nounwind { ; LA64F-NEXT: .LBB9_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB9_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB9_1 +; LA64F-NEXT: bne $a3, $a2, .LBB9_1 ; 
LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -803,6 +803,7 @@ define float @float_fsub_release(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB9_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB9_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -820,8 +821,7 @@ define float @float_fsub_release(ptr %p) nounwind { ; LA64D-NEXT: .LBB9_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB9_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB9_1 +; LA64D-NEXT: bne $a3, $a2, .LBB9_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, float 1.0 release, align 4 @@ -843,6 +843,7 @@ define float @float_fmin_release(ptr %p) nounwind { ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB10_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB10_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -860,8 +861,7 @@ define float @float_fmin_release(ptr %p) nounwind { ; LA64F-NEXT: .LBB10_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB10_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB10_1 +; LA64F-NEXT: bne $a3, $a2, .LBB10_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -879,6 +879,7 @@ define float @float_fmin_release(ptr %p) nounwind { ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB10_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB10_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -896,8 +897,7 @@ define float @float_fmin_release(ptr %p) nounwind { ; LA64D-NEXT: .LBB10_6: # 
%atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB10_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB10_1 +; LA64D-NEXT: bne $a3, $a2, .LBB10_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, float 1.0 release, align 4 @@ -919,6 +919,7 @@ define float @float_fmax_release(ptr %p) nounwind { ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB11_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB11_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -936,8 +937,7 @@ define float @float_fmax_release(ptr %p) nounwind { ; LA64F-NEXT: .LBB11_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB11_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB11_1 +; LA64F-NEXT: bne $a3, $a2, .LBB11_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -955,6 +955,7 @@ define float @float_fmax_release(ptr %p) nounwind { ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB11_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB11_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -972,8 +973,7 @@ define float @float_fmax_release(ptr %p) nounwind { ; LA64D-NEXT: .LBB11_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB11_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB11_1 +; LA64D-NEXT: bne $a3, $a2, .LBB11_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmax ptr %p, float 1.0 release, align 4 @@ -1372,6 +1372,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; 
LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB16_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB16_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -1389,8 +1390,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: .LBB16_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB16_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB16_1 +; LA64F-NEXT: bne $a3, $a2, .LBB16_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -1407,6 +1407,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB16_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB16_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -1424,8 +1425,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: .LBB16_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB16_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB16_1 +; LA64D-NEXT: bne $a3, $a2, .LBB16_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fadd ptr %p, float 1.0 acq_rel, align 4 @@ -1446,6 +1446,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB17_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB17_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -1463,8 +1464,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: .LBB17_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB17_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB17_1 +; LA64F-NEXT: bne $a3, $a2, .LBB17_1 ; LA64F-NEXT: # %bb.2: # 
%atomicrmw.end ; LA64F-NEXT: ret ; @@ -1481,6 +1481,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB17_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB17_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -1498,8 +1499,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: .LBB17_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB17_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB17_1 +; LA64D-NEXT: bne $a3, $a2, .LBB17_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, float 1.0 acq_rel, align 4 @@ -1521,6 +1521,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB18_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB18_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -1538,8 +1539,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: .LBB18_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB18_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB18_1 +; LA64F-NEXT: bne $a3, $a2, .LBB18_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -1557,6 +1557,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB18_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB18_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -1574,8 +1575,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: .LBB18_6: # 
%atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB18_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB18_1 +; LA64D-NEXT: bne $a3, $a2, .LBB18_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, float 1.0 acq_rel, align 4 @@ -1597,6 +1597,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB19_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB19_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -1614,8 +1615,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { ; LA64F-NEXT: .LBB19_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB19_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB19_1 +; LA64F-NEXT: bne $a3, $a2, .LBB19_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -1633,6 +1633,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB19_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB19_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -1650,8 +1651,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: .LBB19_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB19_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB19_1 +; LA64D-NEXT: bne $a3, $a2, .LBB19_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmax ptr %p, float 1.0 acq_rel, align 4 @@ -2074,6 +2074,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, 
$fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB24_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB24_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2091,8 +2092,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: .LBB24_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB24_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB24_1 +; LA64F-NEXT: bne $a3, $a2, .LBB24_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -2109,6 +2109,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB24_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB24_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -2126,8 +2127,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: .LBB24_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB24_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB24_1 +; LA64D-NEXT: bne $a3, $a2, .LBB24_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fadd ptr %p, float 1.0 seq_cst, align 4 @@ -2148,6 +2148,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB25_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB25_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2165,8 +2166,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: .LBB25_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB25_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB25_1 +; LA64F-NEXT: bne $a3, $a2, .LBB25_1 ; LA64F-NEXT: # 
%bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -2183,6 +2183,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB25_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB25_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -2200,8 +2201,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: .LBB25_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB25_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB25_1 +; LA64D-NEXT: bne $a3, $a2, .LBB25_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, float 1.0 seq_cst, align 4 @@ -2223,6 +2223,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB26_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB26_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2240,8 +2241,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: .LBB26_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB26_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB26_1 +; LA64F-NEXT: bne $a3, $a2, .LBB26_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -2259,6 +2259,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB26_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB26_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -2276,8 +2277,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: .LBB26_6: # 
%atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB26_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB26_1 +; LA64D-NEXT: bne $a3, $a2, .LBB26_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, float 1.0 seq_cst, align 4 @@ -2299,6 +2299,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB27_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB27_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2316,8 +2317,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { ; LA64F-NEXT: .LBB27_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB27_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB27_1 +; LA64F-NEXT: bne $a3, $a2, .LBB27_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -2335,6 +2335,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB27_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB27_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -2352,8 +2353,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: .LBB27_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB27_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB27_1 +; LA64D-NEXT: bne $a3, $a2, .LBB27_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmax ptr %p, float 1.0 seq_cst, align 4 @@ -2752,6 +2752,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, 
$fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB32_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB32_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2769,8 +2770,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind { ; LA64F-NEXT: .LBB32_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB32_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB32_1 +; LA64F-NEXT: bne $a3, $a2, .LBB32_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -2787,6 +2787,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB32_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB32_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -2804,8 +2805,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind { ; LA64D-NEXT: .LBB32_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB32_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB32_1 +; LA64D-NEXT: bne $a3, $a2, .LBB32_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fadd ptr %p, float 1.0 monotonic, align 4 @@ -2826,6 +2826,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB33_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB33_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2843,8 +2844,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { ; LA64F-NEXT: .LBB33_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB33_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB33_1 +; LA64F-NEXT: bne $a3, $a2, .LBB33_1 ; 
LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -2861,6 +2861,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB33_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB33_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -2878,8 +2879,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { ; LA64D-NEXT: .LBB33_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB33_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB33_1 +; LA64D-NEXT: bne $a3, $a2, .LBB33_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, float 1.0 monotonic, align 4 @@ -2901,6 +2901,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind { ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB34_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB34_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2918,8 +2919,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind { ; LA64F-NEXT: .LBB34_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB34_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB34_1 +; LA64F-NEXT: bne $a3, $a2, .LBB34_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -2937,6 +2937,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind { ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB34_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB34_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -2954,8 +2955,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind { ; 
LA64D-NEXT: .LBB34_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB34_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB34_1 +; LA64D-NEXT: bne $a3, $a2, .LBB34_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, float 1.0 monotonic, align 4 @@ -2977,6 +2977,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind { ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64F-NEXT: movfr2gr.s $a1, $fa2 ; LA64F-NEXT: movfr2gr.s $a2, $fa0 +; LA64F-NEXT: addi.w $a2, $a2, 0 ; LA64F-NEXT: .LBB35_3: # %atomicrmw.start ; LA64F-NEXT: # Parent Loop BB35_1 Depth=1 ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 @@ -2994,8 +2995,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind { ; LA64F-NEXT: .LBB35_6: # %atomicrmw.start ; LA64F-NEXT: # in Loop: Header=BB35_1 Depth=1 ; LA64F-NEXT: movgr2fr.w $fa0, $a3 -; LA64F-NEXT: addi.w $a1, $a2, 0 -; LA64F-NEXT: bne $a3, $a1, .LBB35_1 +; LA64F-NEXT: bne $a3, $a2, .LBB35_1 ; LA64F-NEXT: # %bb.2: # %atomicrmw.end ; LA64F-NEXT: ret ; @@ -3013,6 +3013,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind { ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 ; LA64D-NEXT: movfr2gr.s $a1, $fa2 ; LA64D-NEXT: movfr2gr.s $a2, $fa0 +; LA64D-NEXT: addi.w $a2, $a2, 0 ; LA64D-NEXT: .LBB35_3: # %atomicrmw.start ; LA64D-NEXT: # Parent Loop BB35_1 Depth=1 ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 @@ -3030,8 +3031,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind { ; LA64D-NEXT: .LBB35_6: # %atomicrmw.start ; LA64D-NEXT: # in Loop: Header=BB35_1 Depth=1 ; LA64D-NEXT: movgr2fr.w $fa0, $a3 -; LA64D-NEXT: addi.w $a1, $a2, 0 -; LA64D-NEXT: bne $a3, $a1, .LBB35_1 +; LA64D-NEXT: bne $a3, $a2, .LBB35_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end ; LA64D-NEXT: ret %v = atomicrmw fmax ptr %p, float 1.0 monotonic, align 4 diff --git a/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir 
new file mode 100644 index 00000000000000..9bbb579b762e63 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir @@ -0,0 +1,213 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -o - %s -mtriple=loongarch64 \ +# RUN: -run-pass=register-coalescer -join-liveintervals=1 -join-splitedges=0 | FileCheck %s + +--- +name: foo +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $r4, $r5, $r6, $r7, $r8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $r8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $r7 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $r6 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $r5 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr = COPY $r4 + ; CHECK-NEXT: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY3]], 1 + ; CHECK-NEXT: [[ORI:%[0-9]+]]:gpr = ORI $r0, 1 + ; CHECK-NEXT: [[ANDI1:%[0-9]+]]:gpr = ANDI [[COPY2]], 1 + ; CHECK-NEXT: [[ANDI2:%[0-9]+]]:gpr = ANDI [[COPY1]], 1 + ; CHECK-NEXT: [[ANDI3:%[0-9]+]]:gpr = ANDI [[COPY]], 1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr = COPY [[COPY5]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQZ [[ANDI]], %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: PseudoBR %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.7(0x7c000000), %bb.6(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead 
[[LD_D:%[0-9]+]]:gpr = LD_D $r0, 8 + ; CHECK-NEXT: dead [[LD_D1:%[0-9]+]]:gpr = LD_D $r0, 0 + ; CHECK-NEXT: BNEZ [[ANDI1]], %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6: + ; CHECK-NEXT: successors: %bb.11(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NEXT: PseudoBR %bb.11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7: + ; CHECK-NEXT: successors: %bb.8(0x7c000000), %bb.10(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQZ [[ANDI2]], %bb.10 + ; CHECK-NEXT: PseudoBR %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8: + ; CHECK-NEXT: successors: %bb.9(0x04000000), %bb.5(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 + ; CHECK-NEXT: BEQZ [[ANDI3]], %bb.5 + ; CHECK-NEXT: PseudoBR %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ST_B $r0, [[COPY4]], 0 + ; CHECK-NEXT: PseudoBR %bb.12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10: + ; CHECK-NEXT: successors: %bb.11(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY [[ORI]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.11: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ST_D $r0, [[COPY4]], 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.12: + ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.1(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BEQ [[COPY7]], [[ORI]], %bb.2 + ; CHECK-NEXT: PseudoBR %bb.1 + bb.0: + liveins: $r4, $r5, $r6, $r7, $r8 + + %0:gpr = COPY killed $r8 + %1:gpr = COPY killed $r7 + %2:gpr = COPY killed $r6 + %3:gpr = COPY killed $r5 + %4:gpr = COPY killed $r4 + %5:gpr = COPY $r0 + %6:gpr = COPY killed %5 + %7:gpr = ANDI killed %3, 1 + %8:gpr = ORI $r0, 1 + %9:gpr = ANDI killed %2, 1 + %10:gpr = ANDI killed %1, 1 + %11:gpr = ANDI killed %0, 1 + %12:gpr = COPY %6 + %13:gpr = COPY killed %6 + %14:gpr = 
IMPLICIT_DEF + + bb.1: + %15:gpr = COPY killed %14 + %16:gpr = COPY killed %13 + %17:gpr = COPY killed %12 + %18:gpr = COPY %17 + %19:gpr = COPY %16 + %20:gpr = COPY killed %16 + %21:gpr = COPY killed %15 + + bb.2: + successors: %bb.3, %bb.4 + + %22:gpr = COPY killed %21 + %23:gpr = COPY killed %20 + %24:gpr = COPY killed %19 + %25:gpr = COPY killed %18 + BEQZ %7, %bb.4 + + bb.3: + %26:gpr = COPY killed %24 + %27:gpr = COPY killed %23 + PseudoBR %bb.9 + + bb.4: + %28:gpr = COPY killed %23 + + bb.5: + successors: %bb.7(0x7c000000), %bb.6(0x04000000) + + %29:gpr = COPY killed %28 + dead %30:gpr = LD_D $r0, 8 + dead %31:gpr = LD_D $r0, 0 + BNEZ %9, %bb.7 + + bb.6: + %32:gpr = COPY $r0 + %33:gpr = COPY killed %32 + %34:gpr = COPY killed %33 + %35:gpr = COPY killed %22 + PseudoBR %bb.11 + + bb.7: + successors: %bb.8(0x7c000000), %bb.10(0x04000000) + + BEQZ %10, %bb.10 + PseudoBR %bb.8 + + bb.8: + successors: %bb.9(0x04000000), %bb.5(0x7c000000) + + %36:gpr = ADDI_D killed %29, 1 + %28:gpr = COPY %36 + %26:gpr = COPY %36 + %27:gpr = COPY killed %36 + BEQZ %11, %bb.5 + PseudoBR %bb.9 + + bb.9: + %37:gpr = COPY killed %27 + %38:gpr = COPY killed %26 + %39:gpr = COPY $r0 + ST_B killed %39, %4, 0 + %40:gpr = COPY killed %25 + %41:gpr = COPY killed %38 + %42:gpr = COPY killed %37 + %43:gpr = COPY killed %22 + PseudoBR %bb.12 + + bb.10: + %44:gpr = ADDI_D killed %29, 1 + %34:gpr = COPY %8 + %35:gpr = COPY killed %44 + + bb.11: + %45:gpr = COPY killed %35 + %46:gpr = COPY killed %34 + %47:gpr = COPY $r0 + ST_D killed %47, %4, 0 + %40:gpr = COPY %45 + %41:gpr = COPY %46 + %42:gpr = COPY killed %46 + %43:gpr = COPY killed %45 + + bb.12: + successors: %bb.2(0x7c000000), %bb.1(0x04000000) + + %48:gpr = COPY killed %43 + %49:gpr = COPY killed %42 + %50:gpr = COPY killed %41 + %51:gpr = COPY killed %40 + %12:gpr = COPY %51 + %13:gpr = COPY %50 + %14:gpr = COPY %48 + %18:gpr = COPY killed %51 + %19:gpr = COPY killed %50 + %20:gpr = COPY killed %49 + %21:gpr = COPY killed %48 + BEQ 
%17, %8, %bb.2 + PseudoBR %bb.1 + +... diff --git a/llvm/test/CodeGen/Mips/Fast-ISel/pr40325.ll b/llvm/test/CodeGen/Mips/Fast-ISel/pr40325.ll index 9e64d7b2fa039b..c276515920d52a 100644 --- a/llvm/test/CodeGen/Mips/Fast-ISel/pr40325.ll +++ b/llvm/test/CodeGen/Mips/Fast-ISel/pr40325.ll @@ -11,7 +11,7 @@ define void @test(i32 %x, ptr %p) nounwind { ; CHECK-NEXT: andi $1, $4, 1 ; CHECK-NEXT: bgtz $1, $BB0_1 ; CHECK-NEXT: nop -; CHECK-NEXT: # %bb.1: # %foo +; CHECK-NEXT: $BB0_1: # %foo ; CHECK-NEXT: jr $ra ; CHECK-NEXT: nop %y = and i32 %x, 1 diff --git a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/jump_table_and_brjt.ll b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/jump_table_and_brjt.ll index 4c10fedaa4a884..74765ff0e8f106 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/jump_table_and_brjt.ll +++ b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/jump_table_and_brjt.ll @@ -25,7 +25,7 @@ define i32 @mod4_0_to_11(i32 %a) { ; MIPS32-NEXT: sltu $1, $1, $2 ; MIPS32-NEXT: bnez $1, $BB0_6 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_1: # %entry +; MIPS32-NEXT: # %bb.1: # %entry ; MIPS32-NEXT: lw $2, 28($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lui $1, %hi($JTI0_0) ; MIPS32-NEXT: sll $2, $2, 2 @@ -65,7 +65,7 @@ define i32 @mod4_0_to_11(i32 %a) { ; MIPS32-NEXT: sltu $1, $1, $2 ; MIPS32-NEXT: bnez $1, $BB0_13 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_8: # %sw.epilog +; MIPS32-NEXT: # %bb.8: # %sw.epilog ; MIPS32-NEXT: lw $2, 0($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lui $1, %hi($JTI0_1) ; MIPS32-NEXT: sll $2, $2, 2 @@ -125,7 +125,7 @@ define i32 @mod4_0_to_11(i32 %a) { ; MIPS32_PIC-NEXT: sltu $1, $1, $2 ; MIPS32_PIC-NEXT: bnez $1, $BB0_6 ; MIPS32_PIC-NEXT: nop -; MIPS32_PIC-NEXT: $BB0_1: # %entry +; MIPS32_PIC-NEXT: # %bb.1: # %entry ; MIPS32_PIC-NEXT: lw $2, 8($sp) # 4-byte Folded Reload ; MIPS32_PIC-NEXT: lw $3, 36($sp) # 4-byte Folded Reload ; MIPS32_PIC-NEXT: lw $1, %got($JTI0_0)($2) @@ -167,7 +167,7 @@ define i32 @mod4_0_to_11(i32 %a) { ; MIPS32_PIC-NEXT: sltu $1, $1, $2 ; 
MIPS32_PIC-NEXT: bnez $1, $BB0_13 ; MIPS32_PIC-NEXT: nop -; MIPS32_PIC-NEXT: $BB0_8: # %sw.epilog +; MIPS32_PIC-NEXT: # %bb.8: # %sw.epilog ; MIPS32_PIC-NEXT: lw $2, 8($sp) # 4-byte Folded Reload ; MIPS32_PIC-NEXT: lw $3, 4($sp) # 4-byte Folded Reload ; MIPS32_PIC-NEXT: lw $1, %got($JTI0_1)($2) diff --git a/llvm/test/CodeGen/Mips/compactbranches/unsafe-in-forbidden-slot.ll b/llvm/test/CodeGen/Mips/compactbranches/unsafe-in-forbidden-slot.ll index cbd8b2370ac969..de61515cff9bcb 100644 --- a/llvm/test/CodeGen/Mips/compactbranches/unsafe-in-forbidden-slot.ll +++ b/llvm/test/CodeGen/Mips/compactbranches/unsafe-in-forbidden-slot.ll @@ -18,7 +18,7 @@ sw.bb: ; preds = %entry br label %sw.epilog ; CHECK: beqzc ; CHECK-NEXT: nop -; CHECK-NEXT: .LBB +; CHECK-NEXT: # %bb.1 ; CHECK-NEXT: j sw.bb1: ; preds = %entry, %entry @@ -26,7 +26,7 @@ sw.bb1: ; preds = %entry, %entry br label %sw.epilog ; CHECK: bnezc ; CHECK-NEXT: nop -; CHECK-NEXT: .LBB +; CHECK-NEXT: # %bb.3 ; CHECK-NEXT: j sw.epilog: ; preds = %entry, %sw.bb1, %sw.bb diff --git a/llvm/test/CodeGen/Mips/hf1_body.ll b/llvm/test/CodeGen/Mips/hf1_body.ll index 184ea31bddc9d2..c3dea67896210a 100644 --- a/llvm/test/CodeGen/Mips/hf1_body.ll +++ b/llvm/test/CodeGen/Mips/hf1_body.ll @@ -23,8 +23,8 @@ entry: ; ALL: .set reorder ; ALL: .reloc 0, R_MIPS_NONE, v_sf ; GAS: la $25, $__fn_local_v_sf -; IAS: lw $25, %got($$__fn_local_v_sf)($gp) -; IAS: addiu $25, $25, %lo($$__fn_local_v_sf) +; IAS: lw $25, %got($__fn_local_v_sf)($gp) +; IAS: addiu $25, $25, %lo($__fn_local_v_sf) ; ALL: mfc1 $4, $f12 ; ALL: jr $25 ; ALL: .end __fn_stub_v_sf diff --git a/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll b/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll index 1ce46cfa07cf87..634216027ef6bb 100644 --- a/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll +++ b/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll @@ -42,7 +42,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; MIPS32R2-NEXT: sltiu $1, $4, 7 ; 
MIPS32R2-NEXT: beqz $1, $BB0_6 ; MIPS32R2-NEXT: sw $4, 4($sp) -; MIPS32R2-NEXT: $BB0_1: # %entry +; MIPS32R2-NEXT: # %bb.1: # %entry ; MIPS32R2-NEXT: sll $1, $4, 2 ; MIPS32R2-NEXT: lui $2, %hi($JTI0_0) ; MIPS32R2-NEXT: addu $1, $1, $2 @@ -100,7 +100,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; MIPS32R6-NEXT: sltiu $1, $4, 7 ; MIPS32R6-NEXT: beqz $1, $BB0_6 ; MIPS32R6-NEXT: sw $4, 4($sp) -; MIPS32R6-NEXT: $BB0_1: # %entry +; MIPS32R6-NEXT: # %bb.1: # %entry ; MIPS32R6-NEXT: sll $1, $4, 2 ; MIPS32R6-NEXT: lui $2, %hi($JTI0_0) ; MIPS32R6-NEXT: addu $1, $1, $2 @@ -159,7 +159,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; MIPS64R2-NEXT: sltiu $1, $2, 7 ; MIPS64R2-NEXT: beqz $1, .LBB0_6 ; MIPS64R2-NEXT: sw $4, 4($sp) -; MIPS64R2-NEXT: .LBB0_1: # %entry +; MIPS64R2-NEXT: # %bb.1: # %entry ; MIPS64R2-NEXT: dsll $1, $2, 3 ; MIPS64R2-NEXT: lui $2, %highest(.LJTI0_0) ; MIPS64R2-NEXT: daddiu $2, $2, %higher(.LJTI0_0) @@ -254,7 +254,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; MIPS64R6-NEXT: sltiu $1, $2, 7 ; MIPS64R6-NEXT: beqz $1, .LBB0_6 ; MIPS64R6-NEXT: sw $4, 4($sp) -; MIPS64R6-NEXT: .LBB0_1: # %entry +; MIPS64R6-NEXT: # %bb.1: # %entry ; MIPS64R6-NEXT: dsll $1, $2, 3 ; MIPS64R6-NEXT: lui $2, %highest(.LJTI0_0) ; MIPS64R6-NEXT: daddiu $2, $2, %higher(.LJTI0_0) @@ -351,7 +351,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; PIC-MIPS32R2-NEXT: sltiu $1, $4, 7 ; PIC-MIPS32R2-NEXT: beqz $1, $BB0_6 ; PIC-MIPS32R2-NEXT: sw $4, 4($sp) -; PIC-MIPS32R2-NEXT: $BB0_1: # %entry +; PIC-MIPS32R2-NEXT: # %bb.1: # %entry ; PIC-MIPS32R2-NEXT: sll $1, $4, 2 ; PIC-MIPS32R2-NEXT: lw $3, %got($JTI0_0)($2) ; PIC-MIPS32R2-NEXT: addu $1, $1, $3 @@ -413,7 +413,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; PIC-MIPS32R6-NEXT: sltiu $1, $4, 7 ; PIC-MIPS32R6-NEXT: beqz $1, $BB0_6 ; PIC-MIPS32R6-NEXT: sw $4, 4($sp) -; PIC-MIPS32R6-NEXT: $BB0_1: # %entry +; PIC-MIPS32R6-NEXT: # %bb.1: # %entry ; PIC-MIPS32R6-NEXT: sll $1, $4, 2 ; PIC-MIPS32R6-NEXT: lw $3, %got($JTI0_0)($2) ; 
PIC-MIPS32R6-NEXT: addu $1, $1, $3 @@ -476,7 +476,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; PIC-MIPS64R2-NEXT: sltiu $1, $3, 7 ; PIC-MIPS64R2-NEXT: beqz $1, .LBB0_6 ; PIC-MIPS64R2-NEXT: sw $4, 4($sp) -; PIC-MIPS64R2-NEXT: .LBB0_1: # %entry +; PIC-MIPS64R2-NEXT: # %bb.1: # %entry ; PIC-MIPS64R2-NEXT: dsll $1, $3, 3 ; PIC-MIPS64R2-NEXT: ld $3, %got_page(.LJTI0_0)($2) ; PIC-MIPS64R2-NEXT: daddu $1, $1, $3 @@ -539,7 +539,7 @@ define ptr @_Z3fooi(i32 signext %Letter) { ; PIC-MIPS64R6-NEXT: sltiu $1, $3, 7 ; PIC-MIPS64R6-NEXT: beqz $1, .LBB0_6 ; PIC-MIPS64R6-NEXT: sw $4, 4($sp) -; PIC-MIPS64R6-NEXT: .LBB0_1: # %entry +; PIC-MIPS64R6-NEXT: # %bb.1: # %entry ; PIC-MIPS64R6-NEXT: dsll $1, $3, 3 ; PIC-MIPS64R6-NEXT: ld $3, %got_page(.LJTI0_0)($2) ; PIC-MIPS64R6-NEXT: daddu $1, $1, $3 diff --git a/llvm/test/CodeGen/Mips/inlineasm-constraints-softfloat.ll b/llvm/test/CodeGen/Mips/inlineasm-constraints-softfloat.ll new file mode 100644 index 00000000000000..705570f808ce00 --- /dev/null +++ b/llvm/test/CodeGen/Mips/inlineasm-constraints-softfloat.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -march=mips < %s | FileCheck %s --check-prefix=MIPS32 +; RUN: llc -march=mips64 < %s | FileCheck %s --check-prefix=MIPS64 + +define dso_local void @read_double(ptr nocapture noundef readonly %0) local_unnamed_addr #0 { +; MIPS32-LABEL: read_double: +; MIPS32: # %bb.0: +; MIPS32-NEXT: lw $2, 4($4) +; MIPS32-NEXT: lw $3, 0($4) +; MIPS32-NEXT: #APP +; MIPS32-NEXT: #NO_APP +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: nop +; +; MIPS64-LABEL: read_double: +; MIPS64: # %bb.0: +; MIPS64-NEXT: ld $2, 0($4) +; MIPS64-NEXT: #APP +; MIPS64-NEXT: #NO_APP +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: nop + %2 = load double, ptr %0, align 8 + tail call void asm sideeffect "", "r,~{$1}"(double %2) + ret void +} + +define dso_local void @read_float(ptr nocapture noundef readonly %0) local_unnamed_addr #0 { +; MIPS32-LABEL: 
read_float: +; MIPS32: # %bb.0: +; MIPS32-NEXT: lw $2, 0($4) +; MIPS32-NEXT: #APP +; MIPS32-NEXT: #NO_APP +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: nop +; +; MIPS64-LABEL: read_float: +; MIPS64: # %bb.0: +; MIPS64-NEXT: lw $2, 0($4) +; MIPS64-NEXT: #APP +; MIPS64-NEXT: #NO_APP +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: nop + %2 = load float, ptr %0, align 8 + tail call void asm sideeffect "", "r,~{$1}"(float %2) + ret void +} + +attributes #0 = { "target-features"="+soft-float" "use-soft-float"="true" } diff --git a/llvm/test/CodeGen/Mips/jump-table-mul.ll b/llvm/test/CodeGen/Mips/jump-table-mul.ll index 22f41f53d154bf..cca6080a07544c 100644 --- a/llvm/test/CodeGen/Mips/jump-table-mul.ll +++ b/llvm/test/CodeGen/Mips/jump-table-mul.ll @@ -10,7 +10,7 @@ define i64 @test(i64 %arg) { ; CHECK-NEXT: sltiu $1, $4, 11 ; CHECK-NEXT: beqz $1, .LBB0_4 ; CHECK-NEXT: nop -; CHECK-NEXT: .LBB0_1: # %entry +; CHECK-NEXT: # %bb.1: # %entry ; CHECK-NEXT: daddiu $1, $2, %lo(%neg(%gp_rel(test))) ; CHECK-NEXT: dsll $2, $4, 3 ; CHECK-NEXT: ld $3, %got_page(.LJTI0_0)($1) diff --git a/llvm/test/CodeGen/Mips/llvm-ir/forbidden-slot-ir.ll b/llvm/test/CodeGen/Mips/llvm-ir/forbidden-slot-ir.ll new file mode 100644 index 00000000000000..3e6826f0cd1d13 --- /dev/null +++ b/llvm/test/CodeGen/Mips/llvm-ir/forbidden-slot-ir.ll @@ -0,0 +1,71 @@ +target triple = "mipsisa32r6el-unknown-linux-gnu" + +; RUN: llc -filetype=asm %s -o - | FileCheck %s --check-prefix=MIPSELR6 +; Function Attrs: noinline nounwind optnone uwtable +define i1 @foo0() nounwind { +; MIPSELR6: bnezc $1, $BB0_2 +; MIPSELR6-NEXT: nop +; MIPSELR6: jr $ra +entry: + %0 = icmp eq i32 0, 1 + br i1 %0, label %2, label %3 + ret i1 %0 +2: + ret i1 %0 +3: + ret i1 %0 +} + +define i32 @foo1() nounwind { +; MIPSELR6: addiu $2, $2, 1 +; MIPSELR6-NEXT: .set noreorder +; MIPSELR6-NEXT: beqzc $2, $tmp0 +; MIPSELR6-NEXT: nop +; MIPSELR6-NEXT: .set reorder +; MIPSELR6: jrc $ra +entry: + %0 = tail call i32 asm "1: addiu $0, $0, 1; beqzc $0, 1b", "=r"() 
nounwind + ret i32 %0 +} + +define i32 @foo2() nounwind { +; MIPSELR6: .set push +; MIPSELR6-NEXT: .set at +; MIPSELR6-NEXT: .set macro +; MIPSELR6-NEXT: .set reorder +; MIPSELR6: .set noreorder +; MIPSELR6-NEXT: beqzc $9, End +; MIPSELR6-NEXT: nop +; MIPSELR6-NEXT: .set reorder +; MIPSELR6: addiu $9, $9, 1 +entry: + %0 = tail call i32 asm "beqzc $$t1, End", "=r"() nounwind + %1 = tail call i32 asm "addiu $$t1, $$t1, 1", "=r"() nounwind + %2 = add nsw i32 %1, %0 + ret i32 %2 +} + +define i32 @foo3() nounwind { +; MIPSELR6: addiu $2, $2, 1 +; MIPSELR6-NEXT: .set noreorder +; MIPSELR6-NEXT: beqzc $2, $tmp1 +; MIPSELR6-NEXT: nop +; MIPSELR6-NEXT: .set noreorder +; MIPSELR6-NEXT: j End +; MIPSELR6-NEXT: nop +; MIPSELR6-NEXT: .set reorder +entry: + %0 = tail call i32 asm "1: addiu $0, $0, 1; beqzc $0, 1b; j End", "=r"() nounwind + ret i32 %0 +} + +define i32 @foo4() nounwind { +; MIPSELR6: addiu $2, $2, 1 +; MIPSELR6-NEXT: .set noreorder +; MIPSELR6-NEXT: beqzc $2, $tmp2 +; MIPSELR6-NEXT: addiu $2, $2, 1 +; MIPSELR6-NEXT: .set reorder +entry: + %0 = tail call i32 asm "1: addiu $0, $0, 1; beqzc $0, 1b; addiu $0, $0, 1", "=r"() nounwind + ret i32 %0 +} diff --git a/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll b/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll index afb79e55f4f90b..5d7cdbc0b69edf 100644 --- a/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll +++ b/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll @@ -14,7 +14,7 @@ define i32 @test(i32 signext %x, i32 signext %c) { ; CHECK-NEXT: sltiu $1, $5, 4 ; CHECK-NEXT: beqz $1, $BB0_6 ; CHECK-NEXT: addu $3, $2, $25 -; CHECK-NEXT: $BB0_1: # %entry +; CHECK-NEXT: # %bb.1: # %entry ; CHECK-NEXT: li16 $2, 0 ; CHECK-NEXT: sll16 $5, $5, 2 ; CHECK-NEXT: lw $6, %got($JTI0_0)($3) diff --git a/llvm/test/CodeGen/PowerPC/crsave.ll b/llvm/test/CodeGen/PowerPC/crsave.ll index 81e7a0adcc8ca1..05da108e1fbf87 100644 --- a/llvm/test/CodeGen/PowerPC/crsave.ll +++ b/llvm/test/CodeGen/PowerPC/crsave.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -O0 -frame-pointer=all -mtriple=powerpc-unknown-linux-gnu -mcpu=g5 < %s | FileCheck %s -check-prefix=PPC32 ; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mcpu=g5 < %s | FileCheck %s -check-prefix=PPC64 ; RUN: llc -O0 -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs < %s | FileCheck %s -check-prefix=PPC64-ELFv2 @@ -5,6 +6,102 @@ declare void @foo() define i32 @test_cr2() nounwind uwtable { +; PPC32-LABEL: test_cr2: +; PPC32: # %bb.0: # %entry +; PPC32-NEXT: mflr 0 +; PPC32-NEXT: stwu 1, -32(1) +; PPC32-NEXT: stw 31, 28(1) +; PPC32-NEXT: stw 0, 36(1) +; PPC32-NEXT: .cfi_def_cfa_offset 32 +; PPC32-NEXT: .cfi_offset r31, -4 +; PPC32-NEXT: .cfi_offset lr, 4 +; PPC32-NEXT: mr 31, 1 +; PPC32-NEXT: .cfi_def_cfa_register r31 +; PPC32-NEXT: .cfi_offset cr2, -8 +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 12, 24(31) +; PPC32-NEXT: li 3, 1 +; PPC32-NEXT: li 4, 2 +; PPC32-NEXT: li 5, 3 +; PPC32-NEXT: li 6, 0 +; PPC32-NEXT: #APP +; PPC32-EMPTY: +; PPC32-NEXT: mtcr 6 +; PPC32-NEXT: cmpw 2, 4, 3 +; PPC32-NEXT: mfcr 3 +; PPC32-NEXT: #NO_APP +; PPC32-NEXT: stw 3, 20(31) +; PPC32-NEXT: bl foo +; PPC32-NEXT: lwz 3, 20(31) +; PPC32-NEXT: lwz 12, 24(31) +; PPC32-NEXT: mtocrf 32, 12 +; PPC32-NEXT: lwz 0, 36(1) +; PPC32-NEXT: lwz 31, 28(1) +; PPC32-NEXT: addi 1, 1, 32 +; PPC32-NEXT: mtlr 0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: test_cr2: +; PPC64: # %bb.0: # %entry +; PPC64-NEXT: mflr 0 +; PPC64-NEXT: mfcr 12 +; PPC64-NEXT: stw 12, 8(1) +; PPC64-NEXT: stdu 1, -128(1) +; PPC64-NEXT: std 0, 144(1) +; PPC64-NEXT: .cfi_def_cfa_offset 128 +; PPC64-NEXT: .cfi_offset lr, 16 +; PPC64-NEXT: .cfi_offset cr2, 8 +; PPC64-NEXT: li 3, 1 +; PPC64-NEXT: li 4, 2 +; PPC64-NEXT: li 5, 3 +; PPC64-NEXT: li 6, 0 +; PPC64-NEXT: #APP +; PPC64-EMPTY: +; PPC64-NEXT: mtcr 6 +; PPC64-NEXT: cmpw 2, 4, 3 +; PPC64-NEXT: mfcr 3 +; PPC64-NEXT: #NO_APP +; PPC64-NEXT: stw 3, 124(1) +; PPC64-NEXT: bl foo +; PPC64-NEXT: nop +; 
PPC64-NEXT: lwz 3, 124(1) +; PPC64-NEXT: addi 1, 1, 128 +; PPC64-NEXT: ld 0, 16(1) +; PPC64-NEXT: lwz 12, 8(1) +; PPC64-NEXT: mtocrf 32, 12 +; PPC64-NEXT: mtlr 0 +; PPC64-NEXT: blr +; +; PPC64-ELFv2-LABEL: test_cr2: +; PPC64-ELFv2: # %bb.0: # %entry +; PPC64-ELFv2-NEXT: mflr 0 +; PPC64-ELFv2-NEXT: mfocrf 12, 32 +; PPC64-ELFv2-NEXT: stw 12, 8(1) +; PPC64-ELFv2-NEXT: stdu 1, -112(1) +; PPC64-ELFv2-NEXT: std 0, 128(1) +; PPC64-ELFv2-NEXT: .cfi_def_cfa_offset 112 +; PPC64-ELFv2-NEXT: .cfi_offset lr, 16 +; PPC64-ELFv2-NEXT: .cfi_offset cr2, 8 +; PPC64-ELFv2-NEXT: li 3, 1 +; PPC64-ELFv2-NEXT: li 4, 2 +; PPC64-ELFv2-NEXT: li 5, 3 +; PPC64-ELFv2-NEXT: li 6, 0 +; PPC64-ELFv2-NEXT: #APP +; PPC64-ELFv2-EMPTY: +; PPC64-ELFv2-NEXT: mtcr 6 +; PPC64-ELFv2-NEXT: cmpw 2, 4, 3 +; PPC64-ELFv2-NEXT: mfcr 3 +; PPC64-ELFv2-NEXT: #NO_APP +; PPC64-ELFv2-NEXT: stw 3, 108(1) +; PPC64-ELFv2-NEXT: bl foo +; PPC64-ELFv2-NEXT: nop +; PPC64-ELFv2-NEXT: lwz 3, 108(1) +; PPC64-ELFv2-NEXT: addi 1, 1, 112 +; PPC64-ELFv2-NEXT: ld 0, 16(1) +; PPC64-ELFv2-NEXT: lwz 12, 8(1) +; PPC64-ELFv2-NEXT: mtocrf 32, 12 +; PPC64-ELFv2-NEXT: mtlr 0 +; PPC64-ELFv2-NEXT: blr entry: %ret = alloca i32, align 4 %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmpw 2,$2,$1\0A\09mfcr $0", "=r,r,r,r,r,~{cr2}"(i32 1, i32 2, i32 3, i32 0) nounwind @@ -14,27 +111,104 @@ entry: ret i32 %1 } -; PPC32-LABEL: test_cr2: -; PPC32: stwu 1, -32(1) -; PPC32: stw 31, 28(1) -; PPC32: mfcr 12 -; PPC32-NEXT: stw 12, 24(31) -; PPC32: lwz 12, 24(31) -; PPC32-NEXT: mtocrf 32, 12 - -; PPC64: .cfi_startproc -; PPC64: mfcr 12 -; PPC64: stw 12, 8(1) -; PPC64: stdu 1, -[[AMT:[0-9]+]](1) -; PPC64: .cfi_def_cfa_offset 128 -; PPC64: .cfi_offset lr, 16 -; PPC64: .cfi_offset cr2, 8 -; PPC64: addi 1, 1, [[AMT]] -; PPC64: lwz 12, 8(1) -; PPC64: mtocrf 32, 12 -; PPC64: .cfi_endproc - define i32 @test_cr234() nounwind { +; PPC32-LABEL: test_cr234: +; PPC32: # %bb.0: # %entry +; PPC32-NEXT: mflr 0 +; PPC32-NEXT: stwu 1, -32(1) +; PPC32-NEXT: stw 31, 28(1) 
+; PPC32-NEXT: stw 0, 36(1) +; PPC32-NEXT: mr 31, 1 +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 12, 24(31) +; PPC32-NEXT: li 3, 1 +; PPC32-NEXT: li 4, 2 +; PPC32-NEXT: li 5, 3 +; PPC32-NEXT: li 6, 0 +; PPC32-NEXT: #APP +; PPC32-EMPTY: +; PPC32-NEXT: mtcr 6 +; PPC32-NEXT: cmpw 2, 4, 3 +; PPC32-NEXT: cmpw 3, 4, 4 +; PPC32-NEXT: cmpw 4, 4, 5 +; PPC32-NEXT: mfcr 3 +; PPC32-NEXT: #NO_APP +; PPC32-NEXT: stw 3, 20(31) +; PPC32-NEXT: bl foo +; PPC32-NEXT: lwz 3, 20(31) +; PPC32-NEXT: lwz 12, 24(31) +; PPC32-NEXT: mtocrf 32, 12 +; PPC32-NEXT: mtocrf 16, 12 +; PPC32-NEXT: mtocrf 8, 12 +; PPC32-NEXT: lwz 0, 36(1) +; PPC32-NEXT: lwz 31, 28(1) +; PPC32-NEXT: addi 1, 1, 32 +; PPC32-NEXT: mtlr 0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: test_cr234: +; PPC64: # %bb.0: # %entry +; PPC64-NEXT: mflr 0 +; PPC64-NEXT: mfcr 12 +; PPC64-NEXT: stw 12, 8(1) +; PPC64-NEXT: stdu 1, -128(1) +; PPC64-NEXT: std 0, 144(1) +; PPC64-NEXT: li 3, 1 +; PPC64-NEXT: li 4, 2 +; PPC64-NEXT: li 5, 3 +; PPC64-NEXT: li 6, 0 +; PPC64-NEXT: #APP +; PPC64-EMPTY: +; PPC64-NEXT: mtcr 6 +; PPC64-NEXT: cmpw 2, 4, 3 +; PPC64-NEXT: cmpw 3, 4, 4 +; PPC64-NEXT: cmpw 4, 4, 5 +; PPC64-NEXT: mfcr 3 +; PPC64-NEXT: #NO_APP +; PPC64-NEXT: stw 3, 124(1) +; PPC64-NEXT: bl foo +; PPC64-NEXT: nop +; PPC64-NEXT: lwz 3, 124(1) +; PPC64-NEXT: addi 1, 1, 128 +; PPC64-NEXT: ld 0, 16(1) +; PPC64-NEXT: lwz 12, 8(1) +; PPC64-NEXT: mtocrf 32, 12 +; PPC64-NEXT: mtocrf 16, 12 +; PPC64-NEXT: mtocrf 8, 12 +; PPC64-NEXT: mtlr 0 +; PPC64-NEXT: blr +; +; PPC64-ELFv2-LABEL: test_cr234: +; PPC64-ELFv2: # %bb.0: # %entry +; PPC64-ELFv2-NEXT: mflr 0 +; PPC64-ELFv2-NEXT: mfcr 12 +; PPC64-ELFv2-NEXT: stw 12, 8(1) +; PPC64-ELFv2-NEXT: stdu 1, -112(1) +; PPC64-ELFv2-NEXT: std 0, 128(1) +; PPC64-ELFv2-NEXT: li 3, 1 +; PPC64-ELFv2-NEXT: li 4, 2 +; PPC64-ELFv2-NEXT: li 5, 3 +; PPC64-ELFv2-NEXT: li 6, 0 +; PPC64-ELFv2-NEXT: #APP +; PPC64-ELFv2-EMPTY: +; PPC64-ELFv2-NEXT: mtcr 6 +; PPC64-ELFv2-NEXT: cmpw 2, 4, 3 +; PPC64-ELFv2-NEXT: cmpw 3, 4, 4 +; 
PPC64-ELFv2-NEXT: cmpw 4, 4, 5 +; PPC64-ELFv2-NEXT: mfcr 3 +; PPC64-ELFv2-NEXT: #NO_APP +; PPC64-ELFv2-NEXT: stw 3, 108(1) +; PPC64-ELFv2-NEXT: bl foo +; PPC64-ELFv2-NEXT: nop +; PPC64-ELFv2-NEXT: lwz 3, 108(1) +; PPC64-ELFv2-NEXT: addi 1, 1, 112 +; PPC64-ELFv2-NEXT: ld 0, 16(1) +; PPC64-ELFv2-NEXT: lwz 12, 8(1) +; PPC64-ELFv2-NEXT: mtocrf 32, 12 +; PPC64-ELFv2-NEXT: mtocrf 16, 12 +; PPC64-ELFv2-NEXT: mtocrf 8, 12 +; PPC64-ELFv2-NEXT: mtlr 0 +; PPC64-ELFv2-NEXT: blr entry: %ret = alloca i32, align 4 %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmpw 2,$2,$1\0A\09cmpw 3,$2,$2\0A\09cmpw 4,$2,$3\0A\09mfcr $0", "=r,r,r,r,r,~{cr2},~{cr3},~{cr4}"(i32 1, i32 2, i32 3, i32 0) nounwind @@ -44,41 +218,106 @@ entry: ret i32 %1 } -; PPC32-LABEL: test_cr234: -; PPC32: stwu 1, -32(1) -; PPC32: stw 31, 28(1) -; PPC32: mfcr 12 -; PPC32-NEXT: stw 12, 24(31) -; PPC32: lwz 12, 24(31) -; PPC32-NEXT: mtocrf 32, 12 -; PPC32-NEXT: mtocrf 16, 12 -; PPC32-NEXT: mtocrf 8, 12 - -; PPC64: mfcr 12 -; PPC64: stw 12, 8(1) -; PPC64: stdu 1, -[[AMT:[0-9]+]](1) -; PPC64: addi 1, 1, [[AMT]] -; PPC64: lwz 12, 8(1) -; PPC64: mtocrf 32, 12 -; PPC64: mtocrf 16, 12 -; PPC64: mtocrf 8, 12 - ; Generate mfocrf in prologue when we need to save 1 nonvolatile CR field define void @cloberOneNvCrField() { +; PPC32-LABEL: cloberOneNvCrField: +; PPC32: # %bb.0: # %entry +; PPC32-NEXT: stwu 1, -32(1) +; PPC32-NEXT: stw 31, 28(1) +; PPC32-NEXT: .cfi_def_cfa_offset 32 +; PPC32-NEXT: .cfi_offset r31, -4 +; PPC32-NEXT: mr 31, 1 +; PPC32-NEXT: .cfi_def_cfa_register r31 +; PPC32-NEXT: .cfi_offset cr2, -8 +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 12, 24(31) +; PPC32-NEXT: #APP +; PPC32-NEXT: # clobbers +; PPC32-NEXT: #NO_APP +; PPC32-NEXT: lwz 12, 24(31) +; PPC32-NEXT: mtocrf 32, 12 +; PPC32-NEXT: lwz 31, 28(1) +; PPC32-NEXT: addi 1, 1, 32 +; PPC32-NEXT: blr +; +; PPC64-LABEL: cloberOneNvCrField: +; PPC64: # %bb.0: # %entry +; PPC64-NEXT: mfcr 12 +; PPC64-NEXT: stw 12, 8(1) +; PPC64-NEXT: #APP +; PPC64-NEXT: # clobbers 
+; PPC64-NEXT: #NO_APP +; PPC64-NEXT: lwz 12, 8(1) +; PPC64-NEXT: mtocrf 32, 12 +; PPC64-NEXT: blr +; +; PPC64-ELFv2-LABEL: cloberOneNvCrField: +; PPC64-ELFv2: # %bb.0: # %entry +; PPC64-ELFv2-NEXT: mfocrf 12, 32 +; PPC64-ELFv2-NEXT: stw 12, 8(1) +; PPC64-ELFv2-NEXT: #APP +; PPC64-ELFv2-NEXT: # clobbers +; PPC64-ELFv2-NEXT: #NO_APP +; PPC64-ELFv2-NEXT: lwz 12, 8(1) +; PPC64-ELFv2-NEXT: mtocrf 32, 12 +; PPC64-ELFv2-NEXT: blr entry: tail call void asm sideeffect "# clobbers", "~{cr2}"() ret void - -; PPC64-ELFv2-LABEL: @cloberOneNvCrField -; PPC64-ELFv2: mfocrf [[REG1:[0-9]+]], 32 } ; Generate mfcr in prologue when we need to save all nonvolatile CR field define void @cloberAllNvCrField() { +; PPC32-LABEL: cloberAllNvCrField: +; PPC32: # %bb.0: # %entry +; PPC32-NEXT: stwu 1, -32(1) +; PPC32-NEXT: stw 31, 28(1) +; PPC32-NEXT: .cfi_def_cfa_offset 32 +; PPC32-NEXT: .cfi_offset r31, -4 +; PPC32-NEXT: mr 31, 1 +; PPC32-NEXT: .cfi_def_cfa_register r31 +; PPC32-NEXT: .cfi_offset cr2, -8 +; PPC32-NEXT: .cfi_offset cr3, -8 +; PPC32-NEXT: .cfi_offset cr4, -8 +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 12, 24(31) +; PPC32-NEXT: #APP +; PPC32-NEXT: # clobbers +; PPC32-NEXT: #NO_APP +; PPC32-NEXT: lwz 12, 24(31) +; PPC32-NEXT: mtocrf 32, 12 +; PPC32-NEXT: mtocrf 16, 12 +; PPC32-NEXT: mtocrf 8, 12 +; PPC32-NEXT: lwz 31, 28(1) +; PPC32-NEXT: addi 1, 1, 32 +; PPC32-NEXT: blr +; +; PPC64-LABEL: cloberAllNvCrField: +; PPC64: # %bb.0: # %entry +; PPC64-NEXT: mfcr 12 +; PPC64-NEXT: stw 12, 8(1) +; PPC64-NEXT: #APP +; PPC64-NEXT: # clobbers +; PPC64-NEXT: #NO_APP +; PPC64-NEXT: lwz 12, 8(1) +; PPC64-NEXT: mtocrf 32, 12 +; PPC64-NEXT: mtocrf 16, 12 +; PPC64-NEXT: mtocrf 8, 12 +; PPC64-NEXT: blr +; +; PPC64-ELFv2-LABEL: cloberAllNvCrField: +; PPC64-ELFv2: # %bb.0: # %entry +; PPC64-ELFv2-NEXT: mfcr 12 +; PPC64-ELFv2-NEXT: stw 12, 8(1) +; PPC64-ELFv2-NEXT: #APP +; PPC64-ELFv2-NEXT: # clobbers +; PPC64-ELFv2-NEXT: #NO_APP +; PPC64-ELFv2-NEXT: lwz 12, 8(1) +; PPC64-ELFv2-NEXT: mtocrf 32, 12 +; 
PPC64-ELFv2-NEXT: mtocrf 16, 12 +; PPC64-ELFv2-NEXT: mtocrf 8, 12 +; PPC64-ELFv2-NEXT: blr entry: tail call void asm sideeffect "# clobbers", "~{cr2},~{cr3},~{cr4}"() ret void - -; PPC64-ELFv2-LABEL: @cloberAllNvCrField -; PPC64-ELFv2: mfcr [[REG1:[0-9]+]] } diff --git a/llvm/test/CodeGen/PowerPC/pr59074.ll b/llvm/test/CodeGen/PowerPC/pr59074.ll new file mode 100644 index 00000000000000..3e328c6ad9f0ba --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/pr59074.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s --check-prefix=LE64 +; RUN: llc -mtriple=powerpcle-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s --check-prefix=LE32 +; RUN: llc -mtriple=powerpc64-ibm-aix -mcpu=pwr7 < %s | FileCheck %s --check-prefix=BE64 +; RUN: llc -mtriple=powerpc-ibm-aix -mcpu=pwr7 < %s | FileCheck %s --check-prefix=BE32 + +; To verify this doesn't crash due to array out of bound. +define void @pr59074(ptr %0) { +; LE64-LABEL: pr59074: +; LE64: # %bb.0: # %entry +; LE64-NEXT: lwz 6, 0(3) +; LE64-NEXT: li 7, 12 +; LE64-NEXT: ld 4, 16(3) +; LE64-NEXT: ld 5, 24(3) +; LE64-NEXT: addi 6, 6, -12 +; LE64-NEXT: std 4, 16(3) +; LE64-NEXT: std 5, 24(3) +; LE64-NEXT: srd 6, 7, 6 +; LE64-NEXT: li 7, 0 +; LE64-NEXT: std 7, 8(3) +; LE64-NEXT: std 6, 0(3) +; LE64-NEXT: blr +; +; LE32-LABEL: pr59074: +; LE32: # %bb.0: # %entry +; LE32-NEXT: stwu 1, -80(1) +; LE32-NEXT: .cfi_def_cfa_offset 80 +; LE32-NEXT: lwz 4, 0(3) +; LE32-NEXT: xxlxor 0, 0, 0 +; LE32-NEXT: li 5, 4 +; LE32-NEXT: addi 6, 1, 16 +; LE32-NEXT: li 7, 0 +; LE32-NEXT: li 8, 12 +; LE32-NEXT: xxswapd 0, 0 +; LE32-NEXT: addi 4, 4, -12 +; LE32-NEXT: rlwinm 9, 4, 29, 28, 31 +; LE32-NEXT: stxvd2x 0, 6, 5 +; LE32-NEXT: stw 7, 44(1) +; LE32-NEXT: stw 7, 40(1) +; LE32-NEXT: stw 7, 36(1) +; LE32-NEXT: stw 8, 16(1) +; LE32-NEXT: lwzux 5, 9, 6 +; LE32-NEXT: li 6, 7 +; LE32-NEXT: lwz 7, 8(9) +; LE32-NEXT: nand 6, 4, 6 +; 
LE32-NEXT: lwz 8, 4(9) +; LE32-NEXT: clrlwi 4, 4, 29 +; LE32-NEXT: lwz 9, 12(9) +; LE32-NEXT: clrlwi 6, 6, 27 +; LE32-NEXT: subfic 11, 4, 32 +; LE32-NEXT: srw 5, 5, 4 +; LE32-NEXT: slwi 10, 7, 1 +; LE32-NEXT: srw 7, 7, 4 +; LE32-NEXT: slw 6, 10, 6 +; LE32-NEXT: srw 10, 8, 4 +; LE32-NEXT: slw 8, 8, 11 +; LE32-NEXT: slw 11, 9, 11 +; LE32-NEXT: srw 4, 9, 4 +; LE32-NEXT: or 5, 8, 5 +; LE32-NEXT: or 7, 11, 7 +; LE32-NEXT: or 6, 10, 6 +; LE32-NEXT: stw 4, 12(3) +; LE32-NEXT: stw 7, 8(3) +; LE32-NEXT: stw 5, 0(3) +; LE32-NEXT: stw 6, 4(3) +; LE32-NEXT: addi 1, 1, 80 +; LE32-NEXT: blr +; +; BE64-LABEL: pr59074: +; BE64: # %bb.0: # %entry +; BE64-NEXT: lwz 6, 12(3) +; BE64-NEXT: li 7, 12 +; BE64-NEXT: ld 4, 24(3) +; BE64-NEXT: ld 5, 16(3) +; BE64-NEXT: addi 6, 6, -12 +; BE64-NEXT: std 4, 24(3) +; BE64-NEXT: std 5, 16(3) +; BE64-NEXT: srd 6, 7, 6 +; BE64-NEXT: li 7, 0 +; BE64-NEXT: std 7, 0(3) +; BE64-NEXT: std 6, 8(3) +; BE64-NEXT: blr +; +; BE32-LABEL: pr59074: +; BE32: # %bb.0: # %entry +; BE32-NEXT: lwz 4, 12(3) +; BE32-NEXT: xxlxor 0, 0, 0 +; BE32-NEXT: addi 5, 1, -64 +; BE32-NEXT: li 6, 12 +; BE32-NEXT: li 7, 0 +; BE32-NEXT: addi 8, 1, -48 +; BE32-NEXT: li 10, 7 +; BE32-NEXT: stxvw4x 0, 0, 5 +; BE32-NEXT: addi 4, 4, -12 +; BE32-NEXT: stw 6, -36(1) +; BE32-NEXT: stw 7, -40(1) +; BE32-NEXT: stw 7, -44(1) +; BE32-NEXT: rlwinm 9, 4, 29, 28, 31 +; BE32-NEXT: stw 7, -48(1) +; BE32-NEXT: sub 5, 8, 9 +; BE32-NEXT: nand 6, 4, 10 +; BE32-NEXT: clrlwi 4, 4, 29 +; BE32-NEXT: clrlwi 6, 6, 27 +; BE32-NEXT: lwz 7, 4(5) +; BE32-NEXT: lwz 8, 8(5) +; BE32-NEXT: lwz 9, 0(5) +; BE32-NEXT: lwz 5, 12(5) +; BE32-NEXT: slwi 10, 7, 1 +; BE32-NEXT: srw 11, 8, 4 +; BE32-NEXT: srw 7, 7, 4 +; BE32-NEXT: srw 5, 5, 4 +; BE32-NEXT: slw 6, 10, 6 +; BE32-NEXT: subfic 10, 4, 32 +; BE32-NEXT: srw 4, 9, 4 +; BE32-NEXT: slw 8, 8, 10 +; BE32-NEXT: slw 10, 9, 10 +; BE32-NEXT: or 6, 11, 6 +; BE32-NEXT: or 7, 10, 7 +; BE32-NEXT: or 5, 8, 5 +; BE32-NEXT: stw 4, 0(3) +; BE32-NEXT: stw 6, 8(3) +; BE32-NEXT: stw 
5, 12(3) +; BE32-NEXT: stw 7, 4(3) +; BE32-NEXT: blr +entry: + %v1 = load <2 x i128>, <2 x i128>* %0 + %v2 = insertelement <2 x i128> %v1, i128 12, i32 0 + %v3 = sub <2 x i128> %v1, %v2 + %v4 = lshr <2 x i128> %v2, %v3 + store <2 x i128> %v4, <2 x i128>* %0 + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll b/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll index 6f68679325c579..798637b6840f1e 100644 --- a/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll +++ b/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll @@ -7281,3 +7281,61 @@ entry: store double %str, ptr inttoptr (i64 1000000000000 to ptr), align 4096 ret void } + +define dso_local void @st_reversed_double_from_i8(ptr %ptr) { +; CHECK-P10-LABEL: st_reversed_double_from_i8: +; CHECK-P10: # %bb.0: # %entry +; CHECK-P10-NEXT: li r4, 8 +; CHECK-P10-NEXT: lxsibzx f0, 0, r3 +; CHECK-P10-NEXT: xxspltidp vs2, -1023410176 +; CHECK-P10-NEXT: lxsibzx f1, r3, r4 +; CHECK-P10-NEXT: xscvuxddp f0, f0 +; CHECK-P10-NEXT: xscvuxddp f1, f1 +; CHECK-P10-NEXT: xsadddp f0, f0, f2 +; CHECK-P10-NEXT: xsadddp f1, f1, f2 +; CHECK-P10-NEXT: stfd f1, 0(r3) +; CHECK-P10-NEXT: stfd f0, 8(r3) +; CHECK-P10-NEXT: blr +; +; CHECK-P9-LABEL: st_reversed_double_from_i8: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: li r4, 8 +; CHECK-P9-NEXT: lxsibzx f0, 0, r3 +; CHECK-P9-NEXT: lxsibzx f1, r3, r4 +; CHECK-P9-NEXT: addis r4, r2, .LCPI300_0@toc@ha +; CHECK-P9-NEXT: lfs f2, .LCPI300_0@toc@l(r4) +; CHECK-P9-NEXT: xscvuxddp f0, f0 +; CHECK-P9-NEXT: xscvuxddp f1, f1 +; CHECK-P9-NEXT: xsadddp f0, f0, f2 +; CHECK-P9-NEXT: xsadddp f1, f1, f2 +; CHECK-P9-NEXT: stfd f0, 8(r3) +; CHECK-P9-NEXT: stfd f1, 0(r3) +; CHECK-P9-NEXT: blr +; +; CHECK-P8-LABEL: st_reversed_double_from_i8: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: lbz r4, 0(r3) +; CHECK-P8-NEXT: lbz r5, 8(r3) +; CHECK-P8-NEXT: mtfprwz f0, r4 +; CHECK-P8-NEXT: mtfprwz f1, r5 +; CHECK-P8-NEXT: addis r4, r2, .LCPI300_0@toc@ha +; CHECK-P8-NEXT: lfs f2, .LCPI300_0@toc@l(r4) 
+; CHECK-P8-NEXT: xscvuxddp f0, f0 +; CHECK-P8-NEXT: xscvuxddp f1, f1 +; CHECK-P8-NEXT: xsadddp f0, f0, f2 +; CHECK-P8-NEXT: xsadddp f1, f1, f2 +; CHECK-P8-NEXT: stfd f1, 0(r3) +; CHECK-P8-NEXT: stfd f0, 8(r3) +; CHECK-P8-NEXT: blr +entry: + %idx = getelementptr inbounds i8, ptr %ptr, i64 8 + %i0 = load i8, ptr %ptr, align 1 + %i1 = load i8, ptr %idx, align 1 + %f0 = uitofp i8 %i0 to double + %f1 = uitofp i8 %i1 to double + %a0 = fadd double %f0, -1.280000e+02 + %a1 = fadd double %f1, -1.280000e+02 + store double %a1, ptr %ptr, align 8 + store double %a0, ptr %idx, align 8 + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll b/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll index 824dd4c4db6cb7..f3960573421298 100644 --- a/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll +++ b/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll @@ -7271,3 +7271,61 @@ entry: store double %conv, ptr inttoptr (i64 1000000000000 to ptr), align 4096 ret void } + +define dso_local void @st_reversed_float_from_i8(ptr %ptr) { +; CHECK-P10-LABEL: st_reversed_float_from_i8: +; CHECK-P10: # %bb.0: # %entry +; CHECK-P10-NEXT: li r4, 8 +; CHECK-P10-NEXT: lxsibzx f0, 0, r3 +; CHECK-P10-NEXT: xxspltidp vs2, -1023410176 +; CHECK-P10-NEXT: lxsibzx f1, r3, r4 +; CHECK-P10-NEXT: xscvuxdsp f0, f0 +; CHECK-P10-NEXT: xscvuxdsp f1, f1 +; CHECK-P10-NEXT: xsaddsp f0, f0, f2 +; CHECK-P10-NEXT: xsaddsp f1, f1, f2 +; CHECK-P10-NEXT: stfs f0, 8(r3) +; CHECK-P10-NEXT: stfs f1, 0(r3) +; CHECK-P10-NEXT: blr +; +; CHECK-P9-LABEL: st_reversed_float_from_i8: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: li r4, 8 +; CHECK-P9-NEXT: lxsibzx f0, 0, r3 +; CHECK-P9-NEXT: lxsibzx f1, r3, r4 +; CHECK-P9-NEXT: addis r4, r2, .LCPI300_0@toc@ha +; CHECK-P9-NEXT: lfs f2, .LCPI300_0@toc@l(r4) +; CHECK-P9-NEXT: xscvuxdsp f0, f0 +; CHECK-P9-NEXT: xscvuxdsp f1, f1 +; CHECK-P9-NEXT: xsaddsp f0, f0, f2 +; CHECK-P9-NEXT: xsaddsp f1, f1, f2 +; CHECK-P9-NEXT: stfs f0, 8(r3) +; CHECK-P9-NEXT: stfs f1, 0(r3) +; 
CHECK-P9-NEXT: blr +; +; CHECK-P8-LABEL: st_reversed_float_from_i8: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: lbz r4, 0(r3) +; CHECK-P8-NEXT: lbz r5, 8(r3) +; CHECK-P8-NEXT: mtfprwz f0, r4 +; CHECK-P8-NEXT: mtfprwz f1, r5 +; CHECK-P8-NEXT: addis r4, r2, .LCPI300_0@toc@ha +; CHECK-P8-NEXT: lfs f2, .LCPI300_0@toc@l(r4) +; CHECK-P8-NEXT: xscvuxdsp f0, f0 +; CHECK-P8-NEXT: xscvuxdsp f1, f1 +; CHECK-P8-NEXT: xsaddsp f0, f0, f2 +; CHECK-P8-NEXT: xsaddsp f1, f1, f2 +; CHECK-P8-NEXT: stfs f1, 0(r3) +; CHECK-P8-NEXT: stfs f0, 8(r3) +; CHECK-P8-NEXT: blr +entry: + %idx = getelementptr inbounds i8, ptr %ptr, i64 8 + %i0 = load i8, ptr %ptr, align 1 + %i1 = load i8, ptr %idx, align 1 + %f0 = uitofp i8 %i0 to float + %f1 = uitofp i8 %i1 to float + %a0 = fadd float %f0, -1.280000e+02 + %a1 = fadd float %f1, -1.280000e+02 + store float %a1, ptr %ptr, align 8 + store float %a0, ptr %idx, align 8 + ret void +} diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index b90bef7525379d..ecd14eaffcb5d5 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -84,7 +84,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+zve32x -mattr=+zvksh %s -o - | FileCheck --check-prefix=RV32ZVKSH %s ; RUN: llc -mtriple=riscv32 -mattr=+zve32x -mattr=+zvkt %s -o - | FileCheck --check-prefix=RV32ZVKT %s ; RUN: llc -mtriple=riscv32 -mattr=+zvfh %s -o - | FileCheck --check-prefix=RV32ZVFH %s -; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicond %s -o - | FileCheck --check-prefix=RV32ZICOND %s +; RUN: llc -mtriple=riscv32 -mattr=+zicond %s -o - | FileCheck --check-prefix=RV32ZICOND %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zimop %s -o - | FileCheck --check-prefix=RV32ZIMOP %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zcmop %s -o - | FileCheck --check-prefix=RV32ZCMOP %s ; RUN: llc -mtriple=riscv32 -mattr=+smaia %s -o - | FileCheck --check-prefixes=CHECK,RV32SMAIA %s @@ -186,7 +186,7 @@ ; RUN: llc 
-mtriple=riscv64 -mattr=+zve32x -mattr=+zvksh %s -o - | FileCheck --check-prefix=RV64ZVKSH %s ; RUN: llc -mtriple=riscv64 -mattr=+zve32x -mattr=+zvkt %s -o - | FileCheck --check-prefix=RV64ZVKT %s ; RUN: llc -mtriple=riscv64 -mattr=+zvfh %s -o - | FileCheck --check-prefix=RV64ZVFH %s -; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicond %s -o - | FileCheck --check-prefix=RV64ZICOND %s +; RUN: llc -mtriple=riscv64 -mattr=+zicond %s -o - | FileCheck --check-prefix=RV64ZICOND %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zimop %s -o - | FileCheck --check-prefix=RV64ZIMOP %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zcmop %s -o - | FileCheck --check-prefix=RV64ZCMOP %s ; RUN: llc -mtriple=riscv64 -mattr=+smaia %s -o - | FileCheck --check-prefixes=CHECK,RV64SMAIA %s diff --git a/llvm/test/CodeGen/RISCV/branch-opt.mir b/llvm/test/CodeGen/RISCV/branch-opt.mir new file mode 100644 index 00000000000000..ba3a20f2fbfcd3 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/branch-opt.mir @@ -0,0 +1,68 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc %s -mtriple=riscv64 -run-pass=peephole-opt -o - | FileCheck %s + +# Make sure we shouldn't replace the %2 ADDI with the $x10 ADDI since it has a +# physical register destination. + +--- | + define void @foo(i32 signext %0) { + tail call void @bar(i32 1) + %2 = icmp ugt i32 %0, 1 + br i1 %2, label %3, label %4 + + 3: ; preds = %1 + tail call void @bar(i32 3) + ret void + + 4: ; preds = %1 + ret void + } + + declare void @bar(...) + +... 
+--- +name: foo +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0 (%ir-block.1): + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $x2, implicit $x2 + ; CHECK-NEXT: $x10 = ADDI $x0, 1 + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) @bar, csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit-def $x2 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $x2, implicit $x2 + ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 2 + ; CHECK-NEXT: BLTU [[COPY]], killed [[ADDI]], %bb.2 + ; CHECK-NEXT: PseudoBR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1 (%ir-block.3): + ; CHECK-NEXT: $x10 = ADDI $x0, 3 + ; CHECK-NEXT: PseudoTAIL target-flags(riscv-call) @bar, implicit $x2, implicit $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2 (%ir-block.4): + ; CHECK-NEXT: PseudoRET + bb.0 (%ir-block.1): + successors: %bb.1, %bb.2 + liveins: $x10 + + %0:gpr = COPY $x10 + ADJCALLSTACKDOWN 0, 0, implicit-def dead $x2, implicit $x2 + $x10 = ADDI $x0, 1 + PseudoCALL target-flags(riscv-call) @bar, csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit-def $x2 + ADJCALLSTACKUP 0, 0, implicit-def dead $x2, implicit $x2 + %2:gpr = ADDI $x0, 2 + BLTU %0, killed %2, %bb.2 + PseudoBR %bb.1 + + bb.1 (%ir-block.3): + $x10 = ADDI $x0, 3 + PseudoTAIL target-flags(riscv-call) @bar, implicit $x2, implicit $x10 + + bb.2 (%ir-block.4): + PseudoRET + +... 
diff --git a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll index 6ad529ea477c1a..b26bd7b889807a 100644 --- a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll +++ b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll @@ -3,13 +3,13 @@ ; RUN: | FileCheck -check-prefix=NOCMOV %s ; RUN: llc -mtriple=riscv64 -mattr=+conditional-cmv-fusion,+c -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CMOV,CMOV-NOZICOND %s -; RUN: llc -mtriple=riscv64 -mattr=+conditional-cmv-fusion,+c,+experimental-zicond -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+conditional-cmv-fusion,+c,+zicond -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CMOV,CMOV-ZICOND %s ; RUN: llc -mtriple=riscv64 -mattr=+short-forward-branch-opt -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=SHORT_FORWARD,SFB-NOZICOND %s ; RUN: llc -mtriple=riscv64 -mattr=+short-forward-branch-opt,+c -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=SHORT_FORWARD,SFB-NOZICOND %s -; RUN: llc -mtriple=riscv64 -mattr=+short-forward-branch-opt,+experimental-zicond -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+short-forward-branch-opt,+zicond -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=SHORT_FORWARD,SFB-ZICOND %s ; The conditional move optimization in sifive-p450 requires that only a diff --git a/llvm/test/CodeGen/RISCV/condbinops.ll b/llvm/test/CodeGen/RISCV/condbinops.ll index 87ebefcb65c7de..1a661fddacfa05 100644 --- a/llvm/test/CodeGen/RISCV/condbinops.ll +++ b/llvm/test/CodeGen/RISCV/condbinops.ll @@ -3,8 +3,8 @@ ; RUN: llc -mtriple=riscv64 < %s | FileCheck %s -check-prefix=RV64I ; RUN: llc -mtriple=riscv64 -mattr=+xventanacondops < %s | FileCheck %s -check-prefix=RV64XVENTANACONDOPS ; RUN: llc -mtriple=riscv64 -mattr=+xtheadcondmov < %s | FileCheck %s -check-prefix=RV64XTHEADCONDMOV -; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicond < %s | FileCheck %s -check-prefix=RV32ZICOND -; 
RUN: llc -mtriple=riscv64 -mattr=+experimental-zicond < %s | FileCheck %s -check-prefix=RV64ZICOND +; RUN: llc -mtriple=riscv32 -mattr=+zicond < %s | FileCheck %s -check-prefix=RV32ZICOND +; RUN: llc -mtriple=riscv64 -mattr=+zicond < %s | FileCheck %s -check-prefix=RV64ZICOND define i32 @shl32(i32 %x, i32 %y, i1 %c) { ; RV32I-LABEL: shl32: diff --git a/llvm/test/CodeGen/RISCV/condops.ll b/llvm/test/CodeGen/RISCV/condops.ll index 23f219c2487ebc..101cb5aeeb0940 100644 --- a/llvm/test/CodeGen/RISCV/condops.ll +++ b/llvm/test/CodeGen/RISCV/condops.ll @@ -3,8 +3,8 @@ ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs < %s | FileCheck %s -check-prefix=RV64I ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+xventanacondops < %s | FileCheck %s -check-prefix=RV64XVENTANACONDOPS ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+xtheadcondmov < %s | FileCheck %s -check-prefix=RV64XTHEADCONDMOV -; RUN: llc -mtriple=riscv32 -target-abi=ilp32f -mattr=+f,+zbs,+experimental-zicond < %s | FileCheck %s -check-prefix=RV32ZICOND -; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+experimental-zicond < %s | FileCheck %s -check-prefix=RV64ZICOND +; RUN: llc -mtriple=riscv32 -target-abi=ilp32f -mattr=+f,+zbs,+zicond < %s | FileCheck %s -check-prefix=RV32ZICOND +; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+zicond < %s | FileCheck %s -check-prefix=RV64ZICOND define i64 @zero1(i64 %rs1, i1 zeroext %rc) { ; RV32I-LABEL: zero1: @@ -3719,3 +3719,54 @@ entry: %cond = select i1 %tobool.not, i64 0, i64 %x ret i64 %cond } + +; Test that we don't crash on types larger than 64 bits. 
+define i64 @single_bit3(i80 %x, i64 %y) { +; RV32I-LABEL: single_bit3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lw a0, 8(a0) +; RV32I-NEXT: slli a0, a0, 31 +; RV32I-NEXT: srai a3, a0, 31 +; RV32I-NEXT: and a0, a3, a1 +; RV32I-NEXT: and a1, a3, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: single_bit3: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 63 +; RV64I-NEXT: srai a0, a1, 63 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: ret +; +; RV64XVENTANACONDOPS-LABEL: single_bit3: +; RV64XVENTANACONDOPS: # %bb.0: # %entry +; RV64XVENTANACONDOPS-NEXT: andi a1, a1, 1 +; RV64XVENTANACONDOPS-NEXT: vt.maskc a0, a2, a1 +; RV64XVENTANACONDOPS-NEXT: ret +; +; RV64XTHEADCONDMOV-LABEL: single_bit3: +; RV64XTHEADCONDMOV: # %bb.0: # %entry +; RV64XTHEADCONDMOV-NEXT: slli a1, a1, 63 +; RV64XTHEADCONDMOV-NEXT: srai a0, a1, 63 +; RV64XTHEADCONDMOV-NEXT: and a0, a0, a2 +; RV64XTHEADCONDMOV-NEXT: ret +; +; RV32ZICOND-LABEL: single_bit3: +; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND-NEXT: lw a0, 8(a0) +; RV32ZICOND-NEXT: andi a3, a0, 1 +; RV32ZICOND-NEXT: czero.eqz a0, a1, a3 +; RV32ZICOND-NEXT: czero.eqz a1, a2, a3 +; RV32ZICOND-NEXT: ret +; +; RV64ZICOND-LABEL: single_bit3: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: andi a1, a1, 1 +; RV64ZICOND-NEXT: czero.eqz a0, a2, a1 +; RV64ZICOND-NEXT: ret +entry: + %and = and i80 %x, 18446744073709551616 ; 1 << 64 + %tobool.not = icmp eq i80 %and, 0 + %cond = select i1 %tobool.not, i64 0, i64 %y + ret i64 %cond +} diff --git a/llvm/test/CodeGen/RISCV/pr90652.ll b/llvm/test/CodeGen/RISCV/pr90652.ll new file mode 100644 index 00000000000000..2162395b92ac3c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/pr90652.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=riscv64 | FileCheck %s + +define i1 @test(i64 %x, i1 %cond1, i1 %cond2) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a3, a0, 1 +; CHECK-NEXT: slt a0, a3, a0 
+; CHECK-NEXT: not a1, a1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: or a0, a2, a0 +; CHECK-NEXT: ret +entry: + %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %x, i64 1) + %ov = extractvalue { i64, i1 } %sadd, 1 + %or = or i1 %cond2, %ov + %sel = select i1 %cond1, i1 %cond2, i1 %or + ret i1 %sel +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll index dab530751ef96b..799aebcaa63026 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -238,26 +238,39 @@ define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) { define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) { ; V128-LABEL: interleave_v32f32: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; V128-NEXT: vslidedown.vi v0, v8, 16 -; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; V128-NEXT: vwaddu.vv v24, v0, v8 -; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v24, a0, v8 -; V128-NEXT: lui a1, %hi(.LCPI10_0) -; V128-NEXT: addi a1, a1, %lo(.LCPI10_0) -; V128-NEXT: li a2, 32 -; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; V128-NEXT: vle16.v v12, (a1) -; V128-NEXT: lui a1, 699051 -; V128-NEXT: addi a1, a1, -1366 -; V128-NEXT: vmv.s.x v0, a1 +; V128-NEXT: addi sp, sp, -16 +; V128-NEXT: .cfi_def_cfa_offset 16 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: sub sp, sp, a0 +; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; V128-NEXT: lui a0, %hi(.LCPI10_0) +; V128-NEXT: addi a0, a0, %lo(.LCPI10_0) +; V128-NEXT: li a1, 32 +; V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; V128-NEXT: vle16.v v4, (a0) +; V128-NEXT: lui a0, %hi(.LCPI10_1) +; V128-NEXT: addi a0, a0, %lo(.LCPI10_1) +; V128-NEXT: vle16.v v24, (a0) +; V128-NEXT: addi a0, sp, 16 +; V128-NEXT: vs4r.v v24, (a0) # 
Unknown-size Folded Spill +; V128-NEXT: lui a0, 699051 +; V128-NEXT: addi a0, a0, -1366 +; V128-NEXT: vmv.s.x v0, a0 +; V128-NEXT: vrgatherei16.vv v24, v8, v4 +; V128-NEXT: addi a0, sp, 16 +; V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload ; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; V128-NEXT: vwaddu.vv v0, v8, v16 +; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v0, a0, v16 ; V128-NEXT: vmv8r.v v8, v0 ; V128-NEXT: vmv8r.v v16, v24 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: add sp, sp, a0 +; V128-NEXT: addi sp, sp, 16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v32f32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll index 9e21cc9e3d624a..e1bd16649eede7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -188,30 +188,24 @@ define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) { ; V128-LABEL: interleave_v4i32_offset_1: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; V128-NEXT: vwaddu.vv v10, v8, v8 -; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v10, a0, v8 ; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; V128-NEXT: vid.v v8 -; V128-NEXT: vsrl.vi v8, v8, 1 +; V128-NEXT: vid.v v10 +; V128-NEXT: vsrl.vi v11, v10, 1 +; V128-NEXT: vrgather.vv v10, v8, v11 ; V128-NEXT: vmv.v.i v0, 10 -; V128-NEXT: vadd.vi v8, v8, 1 +; V128-NEXT: vadd.vi v8, v11, 1 ; V128-NEXT: vrgather.vv v10, v9, v8, v0.t ; V128-NEXT: vmv.v.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v4i32_offset_1: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; V512-NEXT: vwaddu.vv v10, v8, v8 -; V512-NEXT: li a0, -1 -; V512-NEXT: vwmaccu.vx v10, a0, v8 ; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu -; 
V512-NEXT: vid.v v8 -; V512-NEXT: vsrl.vi v8, v8, 1 +; V512-NEXT: vid.v v10 +; V512-NEXT: vsrl.vi v11, v10, 1 +; V512-NEXT: vrgather.vv v10, v8, v11 ; V512-NEXT: vmv.v.i v0, 10 -; V512-NEXT: vadd.vi v8, v8, 1 +; V512-NEXT: vadd.vi v8, v11, 1 ; V512-NEXT: vrgather.vv v10, v9, v8, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret @@ -403,26 +397,39 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) { define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { ; V128-LABEL: interleave_v32i32: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; V128-NEXT: vslidedown.vi v0, v8, 16 -; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; V128-NEXT: vwaddu.vv v24, v0, v8 -; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v24, a0, v8 -; V128-NEXT: lui a1, %hi(.LCPI17_0) -; V128-NEXT: addi a1, a1, %lo(.LCPI17_0) -; V128-NEXT: li a2, 32 -; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; V128-NEXT: vle16.v v12, (a1) -; V128-NEXT: lui a1, 699051 -; V128-NEXT: addi a1, a1, -1366 -; V128-NEXT: vmv.s.x v0, a1 +; V128-NEXT: addi sp, sp, -16 +; V128-NEXT: .cfi_def_cfa_offset 16 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: sub sp, sp, a0 +; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; V128-NEXT: lui a0, %hi(.LCPI17_0) +; V128-NEXT: addi a0, a0, %lo(.LCPI17_0) +; V128-NEXT: li a1, 32 +; V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; V128-NEXT: vle16.v v4, (a0) +; V128-NEXT: lui a0, %hi(.LCPI17_1) +; V128-NEXT: addi a0, a0, %lo(.LCPI17_1) +; V128-NEXT: vle16.v v24, (a0) +; V128-NEXT: addi a0, sp, 16 +; V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill +; V128-NEXT: lui a0, 699051 +; V128-NEXT: addi a0, a0, -1366 +; V128-NEXT: vmv.s.x v0, a0 +; V128-NEXT: vrgatherei16.vv v24, v8, v4 +; V128-NEXT: addi a0, sp, 16 +; V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload ; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t ; 
V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; V128-NEXT: vwaddu.vv v0, v8, v16 +; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v0, a0, v16 ; V128-NEXT: vmv8r.v v8, v0 ; V128-NEXT: vmv8r.v v16, v24 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 2 +; V128-NEXT: add sp, sp, a0 +; V128-NEXT: addi sp, sp, 16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v32i32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index a26a87a1f3c139..a56a81f5f793bc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -612,11 +612,13 @@ define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: concat_4xi8_start_undef_at_start: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: li a0, 224 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -4 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -4 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -626,11 +628,13 @@ define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: merge_start_into_end_non_contiguous: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: li a0, 144 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -4 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -4 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -671,11 +675,13 @@ define <8 x 
i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: merge_slidedown: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: li a0, 195 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -686,12 +692,14 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w ; CHECK-LABEL: merge_non_contiguous_slideup_slidedown: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 2 +; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: li a0, 234 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -702,13 +710,16 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: unmergable: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vadd.vi v11, v10, 2 ; CHECK-NEXT: lui a0, %hi(.LCPI46_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) -; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vle8.v v12, (a0) ; CHECK-NEXT: li a0, 234 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vrgather.vv v10, v9, v12, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> 
%w, <8 x i32> ret <8 x i8> %res diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index f889041647b235..eeb8e517d01d2d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -8,51 +8,23 @@ ; FIXME: This should be widened to a vlseg2 of <4 x i32> with VL set to 3 define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) { -; RV32-LABEL: load_factor2_v3: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; RV32-NEXT: vle32.v v10, (a0) -; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v9, v10, 2 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v8, v10, v9 -; RV32-NEXT: li a0, -1 -; RV32-NEXT: vwmaccu.vx v8, a0, v9 -; RV32-NEXT: vmv.v.i v0, 4 -; RV32-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV32-NEXT: vslidedown.vi v12, v10, 4 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV32-NEXT: vrgather.vi v8, v12, 0, v0.t -; RV32-NEXT: vid.v v9 -; RV32-NEXT: vadd.vv v9, v9, v9 -; RV32-NEXT: vadd.vi v11, v9, 1 -; RV32-NEXT: vrgather.vv v9, v10, v11 -; RV32-NEXT: vrgather.vi v9, v12, 1, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: load_factor2_v3: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; RV64-NEXT: vle32.v v10, (a0) -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vid.v v8 -; RV64-NEXT: vadd.vv v8, v8, v8 -; RV64-NEXT: vadd.vi v8, v8, 1 -; RV64-NEXT: vrgather.vv v9, v10, v8 -; RV64-NEXT: vmv.v.i v0, 4 -; RV64-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64-NEXT: vslidedown.vi v12, v10, 4 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV64-NEXT: vrgather.vi v9, v12, 1, v0.t -; RV64-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v11, v10, 2 -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vwaddu.vv v8, v10, v11 -; RV64-NEXT: li a0, -1 -; RV64-NEXT: 
vwmaccu.vx v8, a0, v11 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV64-NEXT: vrgather.vi v8, v12, 0, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: load_factor2_v3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vadd.vv v9, v8, v8 +; CHECK-NEXT: vrgather.vv v8, v10, v9 +; CHECK-NEXT: vmv.v.i v0, 4 +; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v10, 4 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vrgather.vi v8, v12, 0, v0.t +; CHECK-NEXT: vadd.vi v11, v9, 1 +; CHECK-NEXT: vrgather.vv v9, v10, v11 +; CHECK-NEXT: vrgather.vi v9, v12, 1, v0.t +; CHECK-NEXT: ret %interleaved.vec = load <6 x i32>, ptr %ptr %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> %v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> @@ -159,142 +131,163 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 58 +; RV32-NEXT: li a3, 62 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 58 * vlenb -; RV32-NEXT: addi a3, a1, 256 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 62 * vlenb +; RV32-NEXT: addi a3, a1, 128 +; RV32-NEXT: addi a4, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a3) -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 25 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, a1, 128 -; RV32-NEXT: vsetivli zero, 16, e32, 
m4, ta, ma -; RV32-NEXT: vslideup.vi v16, v8, 4 +; RV32-NEXT: vle32.v v16, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 12 +; RV32-NEXT: li a5, 29 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vid.v v20 -; RV32-NEXT: vadd.vi v4, v20, -10 -; RV32-NEXT: vmv.v.v v2, v20 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV32-NEXT: vid.v v10 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 4 +; RV32-NEXT: slli a5, a4, 3 ; RV32-NEXT: add a4, a5, a4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs2r.v v20, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs2r.v v10, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vadd.vi v8, v10, -4 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: li a5, 13 +; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v12, v16, v8 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: li a5, 21 +; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs4r.v v12, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vadd.vi v8, v10, -10 ; RV32-NEXT: lui a4, 12 -; RV32-NEXT: vmv.s.x v1, a4 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vmv.s.x v0, a4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 5 -; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: slli a4, a4, 3 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vs1r.v v0, (a4) # Unknown-size 
Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v16, v16, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: li a5, 45 +; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs1r.v v1, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vrgatherei16.vv v16, v8, v4, v0.t +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 21 +; RV32-NEXT: li a5, 25 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, %hi(.LCPI6_0) ; RV32-NEXT: addi a4, a4, %lo(.LCPI6_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: lui a5, %hi(.LCPI6_1) +; RV32-NEXT: addi a5, a5, %lo(.LCPI6_1) +; RV32-NEXT: lui a6, 1 ; RV32-NEXT: vle16.v v8, (a4) +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v8, (a5) ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 2 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: lui a4, %hi(.LCPI6_1) -; RV32-NEXT: addi a4, a4, %lo(.LCPI6_1) -; RV32-NEXT: lui a5, 1 -; RV32-NEXT: vle16.v v8, (a4) -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v16, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 49 +; RV32-NEXT: li a4, 37 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: 
addi a1, a1, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, a5, -64 +; RV32-NEXT: addi a1, a6, -64 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v4 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v4 -; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vi v8, v10, -2 +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload 
-; RV32-NEXT: vslideup.vi v12, v8, 2 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v12, v16, v8 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v2, -8 +; RV32-NEXT: vadd.vi v8, v10, -8 +; RV32-NEXT: vmv2r.v v30, v10 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl1r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv1r.v v0, v28 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v20, v12 +; RV32-NEXT: vmv.v.v v24, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu @@ -308,165 +301,166 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v0, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # 
Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v8 +; RV32-NEXT: vmv.v.v v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8 +; RV32-NEXT: vrgatherei16.vv v4, v16, v8 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vadd.vi v8, v30, -6 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vi v8, v8, -6 +; RV32-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vmv1r.v v0, v28 +; RV32-NEXT: vmv1r.v v2, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; 
RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t +; RV32-NEXT: vrgatherei16.vv v4, v16, v8, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_6) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_6) -; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v28, (a3) +; RV32-NEXT: vle16.v v20, (a1) +; RV32-NEXT: vle16.v v8, (a3) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs1r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v1, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 -; RV32-NEXT: vmv1r.v v0, v20 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v8, v20 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v8 +; 
RV32-NEXT: vmv.v.v v4, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8 -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vi v8, v8, -4 -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v24, v12 +; RV32-NEXT: vmv.v.v v4, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_8) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_8) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_9) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_9) ; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v28, (a3) +; 
RV32-NEXT: vle16.v v20, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v16 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v8 +; RV32-NEXT: vmv.v.v v4, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: lui a1, %hi(.LCPI6_10) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_10) @@ -474,20 +468,25 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a1, 15 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size 
Folded Reload -; RV32-NEXT: vslideup.vi v20, v16, 6 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v20, v16, v10 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -502,13 +501,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a1, 1008 ; RV32-NEXT: vmv.s.x v28, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v28, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -516,7 +515,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v8, v0, v24 ; RV32-NEXT: vmv1r.v v0, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -529,19 +528,19 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 +; RV32-NEXT: li a3, 21 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload 
; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -554,33 +553,33 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vle16.v v24, (a1) ; RV32-NEXT: vle16.v v8, (a2) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 5 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: li a2, 45 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 49 +; RV32-NEXT: li a2, 37 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 25 +; RV32-NEXT: li a2, 29 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 41 +; RV32-NEXT: li a2, 53 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 5 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: li a2, 45 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload @@ -594,35 +593,37 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a3, a2, 4 -; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: 
vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 2 +; RV32-NEXT: slli a3, a2, 3 +; RV32-NEXT: add a2, a3, a2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: slli a3, a2, 4 +; RV32-NEXT: add a2, a3, a2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 21 +; RV32-NEXT: li a2, 25 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 58 +; RV32-NEXT: li a1, 62 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index df41ac10f80d36..88c299a19fb4e8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -14638,5 +14638,496 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %allones, <8 x i16> poison) ret <8 x i16> %v } + +; v32i64 is not a legal type, so make sure we don't try to combine the mgather +; to a vlse intrinsic until it is legalized and split. 
+define <32 x i64> @mgather_strided_split(ptr %base) { +; RV32V-LABEL: mgather_strided_split: +; RV32V: # %bb.0: +; RV32V-NEXT: li a1, 16 +; RV32V-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32V-NEXT: vlse64.v v8, (a0), a1 +; RV32V-NEXT: addi a0, a0, 256 +; RV32V-NEXT: vlse64.v v16, (a0), a1 +; RV32V-NEXT: ret +; +; RV64V-LABEL: mgather_strided_split: +; RV64V: # %bb.0: +; RV64V-NEXT: li a1, 16 +; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64V-NEXT: vlse64.v v8, (a0), a1 +; RV64V-NEXT: addi a0, a0, 256 +; RV64V-NEXT: vlse64.v v16, (a0), a1 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_strided_split: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -512 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 512 +; RV32ZVE32F-NEXT: sw ra, 508(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s0, 504(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 500(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s3, 496(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s4, 492(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s5, 488(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s6, 484(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s7, 480(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s8, 476(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s9, 472(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s10, 468(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s11, 464(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset ra, -4 +; RV32ZVE32F-NEXT: .cfi_offset s0, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: .cfi_offset s3, -16 +; RV32ZVE32F-NEXT: .cfi_offset s4, -20 +; RV32ZVE32F-NEXT: .cfi_offset s5, -24 +; RV32ZVE32F-NEXT: .cfi_offset s6, -28 +; RV32ZVE32F-NEXT: .cfi_offset s7, -32 +; RV32ZVE32F-NEXT: .cfi_offset s8, -36 +; RV32ZVE32F-NEXT: .cfi_offset s9, -40 +; RV32ZVE32F-NEXT: .cfi_offset s10, -44 +; RV32ZVE32F-NEXT: .cfi_offset s11, -48 +; RV32ZVE32F-NEXT: addi s0, sp, 512 +; RV32ZVE32F-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVE32F-NEXT: andi sp, 
sp, -128 +; RV32ZVE32F-NEXT: li a2, 32 +; RV32ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32ZVE32F-NEXT: vid.v v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 4 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 216(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 208(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 252(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 248(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 244(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 236(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 228(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 220(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 240(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 232(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 224(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 212(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 204(sp) # 4-byte Folded Spill 
+; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 200(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v16 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 196(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 192(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: addi a1, sp, 256 +; RV32ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32ZVE32F-NEXT: vse32.v v8, (a1) +; RV32ZVE32F-NEXT: lw a1, 288(sp) +; RV32ZVE32F-NEXT: lw a2, 292(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 188(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 184(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 296(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 180(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 176(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 300(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 172(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 168(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 304(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 164(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 160(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 308(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 156(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 152(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 312(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 148(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 144(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 316(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 140(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 136(sp) # 
4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 320(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 132(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 128(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 324(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 124(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 120(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 328(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: sw a3, 116(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: sw a2, 112(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 332(sp) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 104(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw ra, 4(a1) +; RV32ZVE32F-NEXT: lw a1, 336(sp) +; RV32ZVE32F-NEXT: lw s10, 0(a2) +; RV32ZVE32F-NEXT: lw s8, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 340(sp) +; RV32ZVE32F-NEXT: lw s6, 0(a1) +; RV32ZVE32F-NEXT: lw s4, 4(a1) +; RV32ZVE32F-NEXT: lw a4, 344(sp) +; RV32ZVE32F-NEXT: lw s2, 0(a2) +; RV32ZVE32F-NEXT: lw t5, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 348(sp) +; RV32ZVE32F-NEXT: lw t3, 0(a4) +; RV32ZVE32F-NEXT: lw t2, 4(a4) +; RV32ZVE32F-NEXT: lw a4, 352(sp) +; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw a7, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 356(sp) +; RV32ZVE32F-NEXT: lw a6, 0(a4) +; RV32ZVE32F-NEXT: lw a5, 4(a4) +; RV32ZVE32F-NEXT: lw a4, 360(sp) +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: sw a1, 108(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: sw a1, 100(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a2, 364(sp) +; RV32ZVE32F-NEXT: lw s11, 0(a4) +; RV32ZVE32F-NEXT: lw s9, 4(a4) +; RV32ZVE32F-NEXT: lw a1, 368(sp) +; RV32ZVE32F-NEXT: lw s7, 0(a2) +; RV32ZVE32F-NEXT: lw s5, 4(a2) +; RV32ZVE32F-NEXT: lw a3, 372(sp) +; RV32ZVE32F-NEXT: lw s3, 0(a1) +; RV32ZVE32F-NEXT: lw t6, 4(a1) +; RV32ZVE32F-NEXT: lw a2, 376(sp) +; 
RV32ZVE32F-NEXT: lw t4, 0(a3) +; RV32ZVE32F-NEXT: lw a1, 380(sp) +; RV32ZVE32F-NEXT: lw t1, 4(a3) +; RV32ZVE32F-NEXT: lw a4, 0(a2) +; RV32ZVE32F-NEXT: lw a3, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 0(a1) +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a5, 196(a0) +; RV32ZVE32F-NEXT: sw a6, 192(a0) +; RV32ZVE32F-NEXT: sw a7, 188(a0) +; RV32ZVE32F-NEXT: sw t0, 184(a0) +; RV32ZVE32F-NEXT: sw t2, 180(a0) +; RV32ZVE32F-NEXT: sw t3, 176(a0) +; RV32ZVE32F-NEXT: sw t5, 172(a0) +; RV32ZVE32F-NEXT: sw s2, 168(a0) +; RV32ZVE32F-NEXT: sw s4, 164(a0) +; RV32ZVE32F-NEXT: sw s6, 160(a0) +; RV32ZVE32F-NEXT: sw s8, 156(a0) +; RV32ZVE32F-NEXT: sw s10, 152(a0) +; RV32ZVE32F-NEXT: sw ra, 148(a0) +; RV32ZVE32F-NEXT: lw a5, 104(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 144(a0) +; RV32ZVE32F-NEXT: lw a5, 112(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 140(a0) +; RV32ZVE32F-NEXT: lw a5, 116(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 136(a0) +; RV32ZVE32F-NEXT: lw a5, 120(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 132(a0) +; RV32ZVE32F-NEXT: lw a5, 124(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 128(a0) +; RV32ZVE32F-NEXT: lw a5, 128(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 124(a0) +; RV32ZVE32F-NEXT: lw a5, 132(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 120(a0) +; RV32ZVE32F-NEXT: lw a5, 136(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 116(a0) +; RV32ZVE32F-NEXT: lw a5, 140(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 112(a0) +; RV32ZVE32F-NEXT: lw a5, 144(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 108(a0) +; RV32ZVE32F-NEXT: lw a5, 148(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 104(a0) +; RV32ZVE32F-NEXT: lw a5, 152(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 100(a0) +; RV32ZVE32F-NEXT: lw a5, 156(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 96(a0) +; RV32ZVE32F-NEXT: lw a5, 160(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 92(a0) +; 
RV32ZVE32F-NEXT: lw a5, 164(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 88(a0) +; RV32ZVE32F-NEXT: lw a5, 168(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 84(a0) +; RV32ZVE32F-NEXT: lw a5, 172(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 80(a0) +; RV32ZVE32F-NEXT: lw a5, 176(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 76(a0) +; RV32ZVE32F-NEXT: lw a5, 180(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 72(a0) +; RV32ZVE32F-NEXT: lw a5, 184(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 68(a0) +; RV32ZVE32F-NEXT: lw a5, 188(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 64(a0) +; RV32ZVE32F-NEXT: lw a5, 208(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 4(a0) +; RV32ZVE32F-NEXT: lw a5, 216(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 252(a0) +; RV32ZVE32F-NEXT: sw a2, 248(a0) +; RV32ZVE32F-NEXT: sw a3, 244(a0) +; RV32ZVE32F-NEXT: sw a4, 240(a0) +; RV32ZVE32F-NEXT: sw t1, 236(a0) +; RV32ZVE32F-NEXT: sw t4, 232(a0) +; RV32ZVE32F-NEXT: sw t6, 228(a0) +; RV32ZVE32F-NEXT: sw s3, 224(a0) +; RV32ZVE32F-NEXT: sw s5, 220(a0) +; RV32ZVE32F-NEXT: sw s7, 216(a0) +; RV32ZVE32F-NEXT: sw s9, 212(a0) +; RV32ZVE32F-NEXT: sw s11, 208(a0) +; RV32ZVE32F-NEXT: lw a1, 100(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 204(a0) +; RV32ZVE32F-NEXT: lw a1, 108(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 200(a0) +; RV32ZVE32F-NEXT: lw a1, 220(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 28(a0) +; RV32ZVE32F-NEXT: lw a1, 228(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 24(a0) +; RV32ZVE32F-NEXT: lw a1, 236(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 20(a0) +; RV32ZVE32F-NEXT: lw a1, 244(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 16(a0) +; RV32ZVE32F-NEXT: lw a1, 248(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 12(a0) +; RV32ZVE32F-NEXT: lw a1, 252(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 8(a0) +; 
RV32ZVE32F-NEXT: lw a1, 192(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 60(a0) +; RV32ZVE32F-NEXT: lw a1, 196(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 56(a0) +; RV32ZVE32F-NEXT: lw a1, 200(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 52(a0) +; RV32ZVE32F-NEXT: lw a1, 204(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 48(a0) +; RV32ZVE32F-NEXT: lw a1, 212(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 44(a0) +; RV32ZVE32F-NEXT: lw a1, 224(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 40(a0) +; RV32ZVE32F-NEXT: lw a1, 232(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 36(a0) +; RV32ZVE32F-NEXT: lw a1, 240(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 32(a0) +; RV32ZVE32F-NEXT: addi sp, s0, -512 +; RV32ZVE32F-NEXT: lw ra, 508(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s0, 504(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 500(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s3, 496(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s4, 492(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s5, 488(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s6, 484(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s7, 480(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s8, 476(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s9, 472(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s10, 468(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s11, 464(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 512 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_strided_split: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -144 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 144 +; RV64ZVE32F-NEXT: sd ra, 136(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s0, 128(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s1, 120(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s2, 112(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s3, 104(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s4, 
96(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s5, 88(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s6, 80(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s7, 72(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s8, 64(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s9, 56(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s10, 48(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s11, 40(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: .cfi_offset ra, -8 +; RV64ZVE32F-NEXT: .cfi_offset s0, -16 +; RV64ZVE32F-NEXT: .cfi_offset s1, -24 +; RV64ZVE32F-NEXT: .cfi_offset s2, -32 +; RV64ZVE32F-NEXT: .cfi_offset s3, -40 +; RV64ZVE32F-NEXT: .cfi_offset s4, -48 +; RV64ZVE32F-NEXT: .cfi_offset s5, -56 +; RV64ZVE32F-NEXT: .cfi_offset s6, -64 +; RV64ZVE32F-NEXT: .cfi_offset s7, -72 +; RV64ZVE32F-NEXT: .cfi_offset s8, -80 +; RV64ZVE32F-NEXT: .cfi_offset s9, -88 +; RV64ZVE32F-NEXT: .cfi_offset s10, -96 +; RV64ZVE32F-NEXT: .cfi_offset s11, -104 +; RV64ZVE32F-NEXT: ld a2, 0(a1) +; RV64ZVE32F-NEXT: sd a2, 32(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 16(a1) +; RV64ZVE32F-NEXT: sd a2, 24(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 32(a1) +; RV64ZVE32F-NEXT: sd a2, 16(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 48(a1) +; RV64ZVE32F-NEXT: sd a2, 8(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a2, 64(a1) +; RV64ZVE32F-NEXT: sd a2, 0(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: ld a7, 80(a1) +; RV64ZVE32F-NEXT: ld t0, 96(a1) +; RV64ZVE32F-NEXT: ld t1, 112(a1) +; RV64ZVE32F-NEXT: ld t2, 128(a1) +; RV64ZVE32F-NEXT: ld t3, 144(a1) +; RV64ZVE32F-NEXT: ld t4, 160(a1) +; RV64ZVE32F-NEXT: ld t5, 176(a1) +; RV64ZVE32F-NEXT: ld t6, 192(a1) +; RV64ZVE32F-NEXT: ld s0, 208(a1) +; RV64ZVE32F-NEXT: ld s1, 224(a1) +; RV64ZVE32F-NEXT: ld s2, 240(a1) +; RV64ZVE32F-NEXT: ld s3, 256(a1) +; RV64ZVE32F-NEXT: ld s4, 272(a1) +; RV64ZVE32F-NEXT: ld s5, 288(a1) +; RV64ZVE32F-NEXT: ld s6, 304(a1) +; RV64ZVE32F-NEXT: ld s7, 320(a1) +; RV64ZVE32F-NEXT: ld s8, 336(a1) +; 
RV64ZVE32F-NEXT: ld s9, 352(a1) +; RV64ZVE32F-NEXT: ld s10, 368(a1) +; RV64ZVE32F-NEXT: ld s11, 384(a1) +; RV64ZVE32F-NEXT: ld ra, 400(a1) +; RV64ZVE32F-NEXT: ld a6, 416(a1) +; RV64ZVE32F-NEXT: ld a5, 432(a1) +; RV64ZVE32F-NEXT: ld a2, 496(a1) +; RV64ZVE32F-NEXT: ld a3, 480(a1) +; RV64ZVE32F-NEXT: ld a4, 464(a1) +; RV64ZVE32F-NEXT: ld a1, 448(a1) +; RV64ZVE32F-NEXT: sd a2, 248(a0) +; RV64ZVE32F-NEXT: sd a3, 240(a0) +; RV64ZVE32F-NEXT: sd a4, 232(a0) +; RV64ZVE32F-NEXT: sd a1, 224(a0) +; RV64ZVE32F-NEXT: sd a5, 216(a0) +; RV64ZVE32F-NEXT: sd a6, 208(a0) +; RV64ZVE32F-NEXT: sd ra, 200(a0) +; RV64ZVE32F-NEXT: sd s11, 192(a0) +; RV64ZVE32F-NEXT: sd s10, 184(a0) +; RV64ZVE32F-NEXT: sd s9, 176(a0) +; RV64ZVE32F-NEXT: sd s8, 168(a0) +; RV64ZVE32F-NEXT: sd s7, 160(a0) +; RV64ZVE32F-NEXT: sd s6, 152(a0) +; RV64ZVE32F-NEXT: sd s5, 144(a0) +; RV64ZVE32F-NEXT: sd s4, 136(a0) +; RV64ZVE32F-NEXT: sd s3, 128(a0) +; RV64ZVE32F-NEXT: sd s2, 120(a0) +; RV64ZVE32F-NEXT: sd s1, 112(a0) +; RV64ZVE32F-NEXT: sd s0, 104(a0) +; RV64ZVE32F-NEXT: sd t6, 96(a0) +; RV64ZVE32F-NEXT: sd t5, 88(a0) +; RV64ZVE32F-NEXT: sd t4, 80(a0) +; RV64ZVE32F-NEXT: sd t3, 72(a0) +; RV64ZVE32F-NEXT: sd t2, 64(a0) +; RV64ZVE32F-NEXT: sd t1, 56(a0) +; RV64ZVE32F-NEXT: sd t0, 48(a0) +; RV64ZVE32F-NEXT: sd a7, 40(a0) +; RV64ZVE32F-NEXT: ld a1, 0(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 32(a0) +; RV64ZVE32F-NEXT: ld a1, 8(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 24(a0) +; RV64ZVE32F-NEXT: ld a1, 16(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 8(a0) +; RV64ZVE32F-NEXT: ld a1, 32(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: sd a1, 0(a0) +; RV64ZVE32F-NEXT: ld ra, 136(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s0, 128(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s1, 120(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s2, 112(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s3, 
104(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s4, 96(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s5, 88(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s6, 80(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s7, 72(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s8, 64(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s9, 56(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s10, 48(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s11, 40(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: addi sp, sp, 144 +; RV64ZVE32F-NEXT: ret + %ptrs = getelementptr inbounds i64, ptr %base, <32 x i64> + %x = call <32 x i64> @llvm.masked.gather.v32i64.v32p0(<32 x ptr> %ptrs, i32 8, <32 x i1> shufflevector(<32 x i1> insertelement(<32 x i1> poison, i1 true, i32 0), <32 x i1> poison, <32 x i32> zeroinitializer), <32 x i64> poison) + ret <32 x i64> %x +} + +define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) { +; RV32V-LABEL: masked_gather_widen_sew_negative_stride: +; RV32V: # %bb.0: +; RV32V-NEXT: addi a0, a0, 136 +; RV32V-NEXT: li a1, -136 +; RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32V-NEXT: vlse64.v v8, (a0), a1 +; RV32V-NEXT: ret +; +; RV64V-LABEL: masked_gather_widen_sew_negative_stride: +; RV64V: # %bb.0: +; RV64V-NEXT: addi a0, a0, 136 +; RV64V-NEXT: li a1, -136 +; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64V-NEXT: vlse64.v v8, (a0), a1 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: masked_gather_widen_sew_negative_stride: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: lui a1, 16393 +; RV32ZVE32F-NEXT: addi a1, a1, -888 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32ZVE32F-NEXT: vmv.s.x v9, a1 +; RV32ZVE32F-NEXT: vluxei8.v v8, (a0), v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: masked_gather_widen_sew_negative_stride: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi a1, a0, 136 +; RV64ZVE32F-NEXT: lw a2, 140(a0) +; RV64ZVE32F-NEXT: lw a3, 0(a0) +; RV64ZVE32F-NEXT: lw a0, 4(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 
4, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vlse32.v v8, (a1), zero +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: ret + %ptrs = getelementptr i32, ptr %base, <4 x i64> + %x = call <4 x i32> @llvm.masked.gather.v4i32.v32p0(<4 x ptr> %ptrs, i32 8, <4 x i1> shufflevector(<4 x i1> insertelement(<4 x i1> poison, i1 true, i32 0), <4 x i1> poison, <4 x i32> zeroinitializer), <4 x i32> poison) + ret <4 x i32> %x +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index f53b51e05c5726..3f0bdb9d5e3166 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -138,8 +138,8 @@ define <4 x i64> @m2_splat_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range ret <4 x i64> %res } -define <4 x i64> @m2_splat_into_identity_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { -; CHECK-LABEL: m2_splat_into_identity_two_source: +define <4 x i64> @m2_splat_into_identity_two_source_v2_hi(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { +; CHECK-LABEL: m2_splat_into_identity_two_source_v2_hi: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vrgather.vi v10, v8, 0 @@ -149,6 +149,18 @@ define <4 x i64> @m2_splat_into_identity_two_source(<4 x i64> %v1, <4 x i64> %v2 ret <4 x i64> %res } +define <4 x i64> @m2_splat_into_slide_two_source_v2_lo(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { +; CHECK-LABEL: m2_splat_into_slide_two_source_v2_lo: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrgather.vi v12, v8, 0 +; CHECK-NEXT: vmv1r.v v13, v10 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: ret + %res = 
shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> + ret <4 x i64> %res +} + define <4 x i64> @m2_splat_into_slide_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_into_slide_two_source: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll index a34fa9502d93b3..d0777962a75651 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll @@ -8,11 +8,13 @@ define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: trn1.v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -22,11 +24,13 @@ define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: trn2.v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -36,14 +40,16 @@ define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: trn1.v16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vadd.vi v10, 
v10, -1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vadd.vi v8, v11, -1 ; CHECK-NEXT: lui a0, 11 ; CHECK-NEXT: addi a0, a0, -1366 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -53,14 +59,16 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: trn2.v16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vrgather.vv v10, v8, v12 ; CHECK-NEXT: lui a0, 11 ; CHECK-NEXT: addi a0, a0, -1366 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -70,10 +78,12 @@ define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: trn1.v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 @@ -83,10 +93,12 @@ define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: trn2.v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli 
zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 @@ -96,11 +108,13 @@ define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: trn1.v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -110,11 +124,13 @@ define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: trn2.v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -147,10 +163,12 @@ define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: trn1.v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 @@ -160,10 +178,12 @@ define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: trn2.v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 @@ -219,10 +239,12 @@ define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: trn1.v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 @@ -232,10 +254,12 @@ define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: trn2.v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x float> %v0, 
<4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 @@ -268,10 +292,12 @@ define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: trn1.v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 @@ -281,10 +307,12 @@ define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: trn2.v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 @@ -294,11 +322,13 @@ define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: trn1.v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 @@ -308,11 +338,13 @@ define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: trn2.v8f16: ; CHECK: # %bb.0: ; 
CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 diff --git a/llvm/test/CodeGen/RISCV/select-binop-identity.ll b/llvm/test/CodeGen/RISCV/select-binop-identity.ll index f45d67164d640d..83bb7f19fa2b05 100644 --- a/llvm/test/CodeGen/RISCV/select-binop-identity.ll +++ b/llvm/test/CodeGen/RISCV/select-binop-identity.ll @@ -7,9 +7,9 @@ ; RUN: | FileCheck -check-prefix=SFB64 %s ; RUN: llc -mtriple=riscv64 -mattr=+xventanacondops -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=VTCONDOPS64 %s -; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicond -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+zicond -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV32,ZICOND,ZICOND32 %s -; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicond -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+zicond -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=ZICOND,ZICOND64 %s ; InstCombine canonicalizes (c ? x | y : x) to (x | (c ? 
y : 0)) similar for diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll index 7fa27a307757d0..7dd223df5e557e 100644 --- a/llvm/test/CodeGen/RISCV/select.ll +++ b/llvm/test/CodeGen/RISCV/select.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32IM %s ; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64IM %s ; RUN: llc -mtriple=riscv64 -mattr=+m,+xventanacondops -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64IMXVTCONDOPS %s -; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-zicond -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECKZICOND,RV32IMZICOND %s -; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-zicond -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECKZICOND,RV64IMZICOND %s +; RUN: llc -mtriple=riscv32 -mattr=+m,+zicond -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECKZICOND,RV32IMZICOND %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+zicond -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECKZICOND,RV64IMZICOND %s define i16 @select_xor_1(i16 %A, i8 %cond) { ; RV32IM-LABEL: select_xor_1: diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll index d2ddbe99000ed0..87406f22d169d1 100644 --- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll +++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll @@ -3,7 +3,7 @@ ; RUN: | FileCheck -check-prefix=NOSFB %s ; RUN: llc -mtriple=riscv64 -mcpu=sifive-u74 -mattr=+zbb -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=SFB,NOZICOND,RV64SFB %s -; RUN: llc -mtriple=riscv64 -mcpu=sifive-u74 -mattr=+experimental-zicond,+zbb \ +; RUN: llc -mtriple=riscv64 -mcpu=sifive-u74 -mattr=+zicond,+zbb \ ; RUN: -verify-machineinstrs < %s | FileCheck -check-prefixes=SFB,ZICOND %s ; RUN: llc -mtriple=riscv32 
-mcpu=sifive-e76 -mattr=+zbb -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=SFB,NOZICOND,RV32SFB %s diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index f878d17d5f1da6..ac67c0769f7056 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -3,8 +3,8 @@ ; RUN: llc < %s -mtriple=riscv64 -mattr=+m -verify-machineinstrs | FileCheck %s --check-prefix=RV64 ; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+zba -verify-machineinstrs | FileCheck %s --check-prefix=RV32ZBA ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zba -verify-machineinstrs | FileCheck %s --check-prefix=RV64ZBA -; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+experimental-zicond -verify-machineinstrs | FileCheck %s --check-prefix=RV32ZICOND -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+experimental-zicond -verify-machineinstrs | FileCheck %s --check-prefix=RV64ZICOND +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+zicond -verify-machineinstrs | FileCheck %s --check-prefix=RV32ZICOND +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zicond -verify-machineinstrs | FileCheck %s --check-prefix=RV64ZICOND ; ; Get the actual value of the overflow bit. 
diff --git a/llvm/test/CodeGen/SPARC/inlineasm-bad.ll b/llvm/test/CodeGen/SPARC/inlineasm-bad.ll index 5bf2adbeb75c95..07eb67df6e5f7e 100644 --- a/llvm/test/CodeGen/SPARC/inlineasm-bad.ll +++ b/llvm/test/CodeGen/SPARC/inlineasm-bad.ll @@ -11,3 +11,12 @@ entry: tail call void asm sideeffect "faddq $0,$1,$2", "{f38},{f0},{f0}"(fp128 0xL0, fp128 0xL0, fp128 0xL0) ret void } + +; CHECK-label:test_twinword_error +; CHECK: error: Hi part of pair should point to an even-numbered register +; CHECK: error: (note that in some cases it might be necessary to manually bind the input/output registers instead of relying on automatic allocation) + +define i64 @test_twinword_error(){ + %1 = tail call i64 asm sideeffect "rd %asr5, ${0:L} \0A\09 srlx ${0:L}, 32, ${0:H}", "={i1}"() + ret i64 %1 +} diff --git a/llvm/test/CodeGen/SPARC/inlineasm.ll b/llvm/test/CodeGen/SPARC/inlineasm.ll index 8bf34bf1609c18..efb7f7c15220c2 100644 --- a/llvm/test/CodeGen/SPARC/inlineasm.ll +++ b/llvm/test/CodeGen/SPARC/inlineasm.ll @@ -143,3 +143,12 @@ entry: %1 = call double asm sideeffect "faddd $1, $2, $0", "=f,f,e"(i64 0, i64 0) ret void } + +; CHECK-label:test_twinword +; CHECK: rd %asr5, %i1 +; CHECK: srlx %i1, 32, %i0 + +define i64 @test_twinword(){ + %1 = tail call i64 asm sideeffect "rd %asr5, ${0:L} \0A\09 srlx ${0:L}, 32, ${0:H}", "={i0}"() + ret i64 %1 +} diff --git a/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll b/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll new file mode 100644 index 00000000000000..3587ecb7f3c94a --- /dev/null +++ b/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll @@ -0,0 +1,25 @@ +;; Test reserving argument registers. 
+; RUN: not llc < %s -mtriple=sparc-linux-gnu -mattr=+reserve-o0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-O0 +; RUN: not llc < %s -mtriple=sparc64-linux-gnu -mattr=+reserve-o0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-O0 +; RUN: not llc < %s -mtriple=sparc-linux-gnu -mattr=+reserve-i0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-I0 +; RUN: not llc < %s -mtriple=sparc64-linux-gnu -mattr=+reserve-i0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-I0 + +; CHECK-RESERVED-O0: error: +; CHECK-RESERVED-O0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. +; CHECK-RESERVED-I0: error: +; CHECK-RESERVED-I0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. +define void @call_function() { + call void @foo() + ret void +} +declare void @foo() + +; CHECK-RESERVED-O0: error: +; CHECK-RESERVED-O0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. +; CHECK-RESERVED-I0: error: +; CHECK-RESERVED-I0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved. +define void @call_function_with_arg(i8 %in) { + call void @bar(i8 %in) + ret void +} +declare void @bar(i8) diff --git a/llvm/test/CodeGen/SPARC/reserved-regs-named.ll b/llvm/test/CodeGen/SPARC/reserved-regs-named.ll new file mode 100644 index 00000000000000..91808be156c559 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/reserved-regs-named.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-l0 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-L0 + +;; Ensure explicit register references are catched as well. 
+ +; CHECK-RESERVED-L0: %l0 +define void @set_reg(i32 zeroext %x) { +entry: + tail call void @llvm.write_register.i32(metadata !0, i32 %x) + ret void +} + +declare void @llvm.write_register.i32(metadata, i32) +!0 = !{!"l0"} diff --git a/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll b/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll new file mode 100644 index 00000000000000..53ca045f100443 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll @@ -0,0 +1,14 @@ +; RUN: not --crash llc -mtriple=sparc64-linux-gnu -o - %s 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-L0 + +;; Ensure explicit register references for non-reserved registers +;; are caught properly. + +; CHECK-RESERVED-L0: LLVM ERROR: Invalid register name global variable +define void @set_reg(i32 zeroext %x) { +entry: + tail call void @llvm.write_register.i32(metadata !0, i32 %x) + ret void +} + +declare void @llvm.write_register.i32(metadata, i32) +!0 = !{!"l0"} diff --git a/llvm/test/CodeGen/SPARC/reserved-regs.ll b/llvm/test/CodeGen/SPARC/reserved-regs.ll index ec6290586eeef2..7dea1f31538b84 100644 --- a/llvm/test/CodeGen/SPARC/reserved-regs.ll +++ b/llvm/test/CodeGen/SPARC/reserved-regs.ll @@ -1,5 +1,14 @@ ; RUN: llc -march=sparc -verify-machineinstrs < %s | FileCheck %s +;; Test reserve-* options. +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-g1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-G1 +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-o1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-O1 +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-l1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-L1 +; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-i1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-I1 + +;; Test multiple reserve-* options together. 
+; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-g1 -mattr=+reserve-o1 -mattr=+reserve-l1 -mattr=+reserve-i1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-G1,CHECK-RESERVED-O1,CHECK-RESERVED-L1,CHECK-RESERVED-I1 + @g = common global [32 x i32] zeroinitializer, align 16 @h = common global [16 x i64] zeroinitializer, align 16 @@ -16,6 +25,10 @@ ; CHECK-NOT: %o6 ; CHECK-NOT: %i6 ; CHECK-NOT: %i7 +; CHECK-RESERVED-G1-NOT: %g1 +; CHECK-RESERVED-O1-NOT: %o1 +; CHECK-RESERVED-L1-NOT: %l1 +; CHECK-RESERVED-I1-NOT: %i1 ; CHECK: ret define void @use_all_i32_regs() { entry: @@ -100,6 +113,10 @@ entry: ; CHECK-NOT: %o7 ; CHECK-NOT: %i6 ; CHECK-NOT: %i7 +; CHECK-RESERVED-G1-NOT: %g1 +; CHECK-RESERVED-O1-NOT: %o1 +; CHECK-RESERVED-L1-NOT: %l1 +; CHECK-RESERVED-I1-NOT: %i1 ; CHECK: ret define void @use_all_i64_regs() { entry: diff --git a/llvm/test/CodeGen/SystemZ/branch-folder-hoist-livein.mir b/llvm/test/CodeGen/SystemZ/branch-folder-hoist-livein.mir index 5e100b88ead300..82e3bae97ec0c4 100644 --- a/llvm/test/CodeGen/SystemZ/branch-folder-hoist-livein.mir +++ b/llvm/test/CodeGen/SystemZ/branch-folder-hoist-livein.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -verify-machineinstrs -O1 -mtriple=s390x-ibm-linux -o - %s -run-pass=branch-folder | FileCheck %s --- | target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64" @@ -15,6 +16,30 @@ name: f1 tracksRegLiveness: true body: | + ; CHECK-LABEL: name: f1 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x7fffffff), %bb.1(0x00000001) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $r1d = LGRL @b :: (load (s32) from got, align 8) + ; CHECK-NEXT: renamable $r1l = LH killed renamable $r1d, 0, $noreg, implicit-def $r1d :: (dereferenceable load (s8) from @b) + ; CHECK-NEXT: renamable $r2l = LHI 0 + ; CHECK-NEXT: renamable $r3d = LGRL @d :: (load (s32) from got, align 8) + ; CHECK-NEXT: renamable $r4d = LLILL 0, 
implicit-def $r4q + ; CHECK-NEXT: renamable $r4d = COPY killed renamable $r4d, implicit killed $r4q + ; CHECK-NEXT: CHI killed renamable $r2l, 0, implicit-def $cc + ; CHECK-NEXT: BRC 14, 6, %bb.2, implicit killed $cc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: + ; CHECK-NEXT: liveins: $r3d, $r4d, $r1l + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: STH renamable $r1l, killed renamable $r3d, 0, $noreg, implicit killed $r4d :: (store (s8) into @d) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $r3d, $r4d, $r1l + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: STH renamable $r1l, killed renamable $r3d, 0, $noreg, implicit killed $r4d :: (store (s8) into @d) + ; CHECK-NEXT: Return bb.0: successors: %bb.2(0x7fffffff), %bb.1(0x00000001) liveins: @@ -44,14 +69,3 @@ body: | Return ... - -# CHECK: renamable $r4d = COPY killed renamable $r4d, implicit killed $r4q -# CHECK-NEXT: CHI killed renamable $r2l, 0, implicit-def $cc -# CHECK-NEXT: BRC 14, 6, %bb.2, implicit killed $cc -# CHECK-NEXT: {{^ $}} -# CHECK-NEXT: bb.1: -# CHECK-NEXT: successors: -# CHECK-NEXT: liveins: $r1l, $r3d, $r4d - -# CHECK: bb.2: -# CHECK-NEXT: liveins: $r1l, $r3d, $r4d diff --git a/llvm/test/CodeGen/SystemZ/frame-29.ll b/llvm/test/CodeGen/SystemZ/frame-29.ll new file mode 100644 index 00000000000000..6cc0d9e985e160 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/frame-29.ll @@ -0,0 +1,18 @@ +; RUN: llc %s -o - -mtriple=s390x-linux-gnu -mcpu=z16 -print-after=finalize-isel 2>&1 | FileCheck %s +; +; Test that the correct space is allocated for the outgoing stack argument. 
+ +declare void @bar(i72 %Arg); + +define void @foo() { +; CHECK-LABEL: # Machine code for function foo: IsSSA, TracksLiveness +; CHECK-NEXT: Frame Objects: +; CHECK-NEXT: fi#0: size=1, align=2, at location [SP] +; CHECK-NEXT: fi#1: size=16, align=8, at location [SP] + +; CHECK-LABEL: foo: +; CHECK: aghi %r15, -184 + %1 = alloca i8, align 2 + tail call fastcc void @bar(i72 2097168) + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/int-usub-12.ll b/llvm/test/CodeGen/SystemZ/int-usub-12.ll index c39a6da37048d3..147fbfd920a9dc 100644 --- a/llvm/test/CodeGen/SystemZ/int-usub-12.ll +++ b/llvm/test/CodeGen/SystemZ/int-usub-12.ll @@ -11,6 +11,7 @@ define zeroext i1 @f1(i128 %a, i128 %b, ptr %res) { ; CHECK-NEXT: vscbiq %v2, %v1, %v0 ; CHECK-NEXT: vlgvg %r2, %v2, 1 ; CHECK-NEXT: vsq %v0, %v1, %v0 +; CHECK-NEXT: xilf %r2, 1 ; CHECK-NEXT: vst %v0, 0(%r4), 3 ; CHECK-NEXT: br %r14 %t = call {i128, i1} @llvm.usub.with.overflow.i128(i128 %a, i128 %b) @@ -27,6 +28,7 @@ define zeroext i1 @f2(i128 %a, i128 %b) { ; CHECK-NEXT: vl %v1, 0(%r2), 3 ; CHECK-NEXT: vscbiq %v0, %v1, %v0 ; CHECK-NEXT: vlgvg %r2, %v0, 1 +; CHECK-NEXT: xilf %r2, 1 ; CHECK-NEXT: br %r14 %t = call {i128, i1} @llvm.usub.with.overflow.i128(i128 %a, i128 %b) %obit = extractvalue {i128, i1} %t, 1 @@ -46,5 +48,25 @@ define i128 @f3(i128 %a, i128 %b) { ret i128 %val } +define i128 @f4(i128 %a, i128 %b) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vscbiq %v2, %v1, %v0 +; CHECK-NEXT: vlgvf %r0, %v2, 3 +; CHECK-NEXT: vgbm %v2, 0 +; CHECK-NEXT: xilf %r0, 1 +; CHECK-NEXT: jl .LBB3_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vsq %v2, %v1, %v0 +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: vst %v2, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %val = call i128 @llvm.usub.sat.i128(i128 %a, i128 %b) + ret i128 %val +} + declare {i128, i1} @llvm.usub.with.overflow.i128(i128, i128) nounwind readnone +declare i128 @llvm.usub.sat.i128(i128, i128) nounwind readnone diff 
--git a/llvm/test/CodeGen/SystemZ/int-usub-13.ll b/llvm/test/CodeGen/SystemZ/int-usub-13.ll index 637e1a81de996f..794af3b73fbe2a 100644 --- a/llvm/test/CodeGen/SystemZ/int-usub-13.ll +++ b/llvm/test/CodeGen/SystemZ/int-usub-13.ll @@ -15,6 +15,7 @@ define zeroext i1 @f1(i256 %a, i256 %b, ptr %res) { ; CHECK-NEXT: vlgvg %r2, %v5, 1 ; CHECK-NEXT: vsbiq %v0, %v1, %v0, %v4 ; CHECK-NEXT: vsq %v1, %v3, %v2 +; CHECK-NEXT: xilf %r2, 1 ; CHECK-NEXT: vst %v1, 16(%r4), 3 ; CHECK-NEXT: vst %v0, 0(%r4), 3 ; CHECK-NEXT: br %r14 @@ -35,6 +36,7 @@ define zeroext i1 @f2(i256 %a, i256 %b) { ; CHECK-NEXT: vscbiq %v2, %v3, %v2 ; CHECK-NEXT: vsbcbiq %v0, %v1, %v0, %v2 ; CHECK-NEXT: vlgvg %r2, %v0, 1 +; CHECK-NEXT: xilf %r2, 1 ; CHECK-NEXT: br %r14 %t = call {i256, i1} @llvm.usub.with.overflow.i256(i256 %a, i256 %b) %obit = extractvalue {i256, i1} %t, 1 diff --git a/llvm/test/CodeGen/SystemZ/loop-01.ll b/llvm/test/CodeGen/SystemZ/loop-01.ll index 15dfae73c97bc9..554c248f8dbf3b 100644 --- a/llvm/test/CodeGen/SystemZ/loop-01.ll +++ b/llvm/test/CodeGen/SystemZ/loop-01.ll @@ -312,3 +312,26 @@ for.inc.i: ; preds = %for.body.i63 %indvars.iv.next156.i.3 = add nsw i64 %indvars.iv155.i, 4 br label %for.body.i63 } + +; Test that offsets are in range for i128 memory accesses. 
+define void @fun10() { +; CHECK-Z13-LABEL: fun10: +; CHECK-Z13: # =>This Inner Loop Header: Depth=1 +; CHECK-Z13-NOT: lay +entry: + %A1 = alloca [3 x [7 x [10 x i128]]], align 8 + br label %for.body + +for.body: ; preds = %for.body, %entry + %IV = phi i64 [ 0, %entry ], [ %IV.next, %for.body ] + %Addr1 = getelementptr inbounds [3 x [7 x [10 x i128]]], ptr %A1, i64 0, i64 %IV, i64 6, i64 6 + store i128 17174966165894859678, ptr %Addr1, align 8 + %Addr2 = getelementptr inbounds [3 x [7 x [10 x i128]]], ptr %A1, i64 0, i64 %IV, i64 6, i64 8 + store i128 17174966165894859678, ptr %Addr2, align 8 + %IV.next = add nuw nsw i64 %IV, 1 + %exitcond.not.i.i = icmp eq i64 %IV.next, 3 + br i1 %exitcond.not.i.i, label %exit, label %for.body + +exit: ; preds = %for.body + unreachable +} diff --git a/llvm/test/CodeGen/SystemZ/zos-ppa2.ll b/llvm/test/CodeGen/SystemZ/zos-ppa2.ll index f54f654b804a23..60580aeb6d83cc 100644 --- a/llvm/test/CodeGen/SystemZ/zos-ppa2.ll +++ b/llvm/test/CodeGen/SystemZ/zos-ppa2.ll @@ -24,7 +24,7 @@ ; CHECK: .byte 0 ; CHECK: .byte 3 ; CHECK: .short 30 -; CHECK: .ascii "\323\323\345\324@@@@@@\361\370\360\360\361\371\367\360\360\361\360\361\360\360\360\360\360\360\360\360" +; CHECK: .ascii "\323\323\345\324@@@@@@{{((\\3[0-7]{2}){4})}}\361\371\367\360\360\361\360\361\360\360\360\360\360\360\360\360" define void @void_test() { entry: ret void diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir index 1bd1d6b99e4225..15aa62d5cff6b5 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir @@ -336,7 +336,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x04000000), %bb.2(0x7c000000) - ; CHECK-NEXT: liveins: $q0, $r0, $r1, $r2, $r3, $r6, $r12 + ; CHECK-NEXT: liveins: $d0, $d1, $r0, $r1, $r2, $r3, $r6, $r12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $r3, dead $cpsr = 
nuw nsw tADDi8 killed renamable $r3, 1, 14 /* CC::al */, $noreg ; CHECK-NEXT: renamable $r0 = tADDhirr killed renamable $r0, renamable $r1, 14 /* CC::al */, $noreg diff --git a/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll new file mode 100644 index 00000000000000..0ce346f7c2e02c --- /dev/null +++ b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=x86_64 -- | FileCheck %s + +define i32 @h(i1 %arg, i32 %arg1) { +; CHECK-LABEL: h: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: movabsq $9166129423, %rcx # imm = 0x22258090F +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: jmp .LBB0_1 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB0_9: # %bb18 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: jne .LBB0_10 +; CHECK-NEXT: .LBB0_1: # %bb4 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: jne .LBB0_2 +; CHECK-NEXT: # %bb.7: # %bb16 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: jne .LBB0_9 +; CHECK-NEXT: # %bb.8: # %bb17 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: jmp .LBB0_9 +; CHECK-NEXT: .LBB0_2: # %bb9 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB0_4 +; CHECK-NEXT: # %bb.3: # %bb13 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: .LBB0_4: # %bb14 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: cmpl $1, %esi +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # %bb.5: # %bb14 +; CHECK-NEXT: movl %eax, %r8d +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: jne 
.LBB0_6 +; CHECK-NEXT: .LBB0_10: # %bb22 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_6: # %bb22.loopexit1 +; CHECK-NEXT: movl %r8d, %eax +; CHECK-NEXT: retq +bb: + br label %bb2 + +bb2: ; preds = %bb14, %bb + %i = phi i64 [ %i5, %bb14 ], [ 0, %bb ] + %i3 = phi i32 [ %i15, %bb14 ], [ 1, %bb ] + br label %bb4 + +bb4: ; preds = %bb18, %bb2 + %i5 = phi i64 [ %i19, %bb18 ], [ %i, %bb2 ] + %i6 = phi i64 [ %i20, %bb18 ], [ %i, %bb2 ] + %i7 = phi i32 [ 0, %bb18 ], [ %i3, %bb2 ] + %i8 = icmp eq i64 %i6, 0 + br i1 %i8, label %bb16, label %bb9 + +bb9: ; preds = %bb4 + br i1 %arg, label %bb12, label %bb10 + +bb10: ; preds = %bb9 + %i11 = sdiv i64 0, 0 + br label %bb12 + +bb12: ; preds = %bb10, %bb9 + br i1 %arg, label %bb13, label %bb14 + +bb13: ; preds = %bb12 + br label %bb14 + +bb14: ; preds = %bb13, %bb12 + %i15 = phi i32 [ 0, %bb13 ], [ %i7, %bb12 ] + switch i32 %arg1, label %bb22 [ + i32 0, label %bb21 + i32 1, label %bb2 + ] + +bb16: ; preds = %bb4 + br i1 %arg, label %bb18, label %bb17 + +bb17: ; preds = %bb16 + br label %bb18 + +bb18: ; preds = %bb17, %bb16 + %i19 = phi i64 [ 9166129423, %bb17 ], [ %i5, %bb16 ] + %i20 = phi i64 [ 9166129423, %bb17 ], [ %i6, %bb16 ] + br i1 %arg, label %bb22, label %bb4 + +bb21: ; preds = %bb14 + br label %bb22 + +bb22: ; preds = %bb21, %bb18, %bb14 + %i23 = phi i32 [ %arg1, %bb21 ], [ %i15, %bb14 ], [ 0, %bb18 ] + ret i32 %i23 +} diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll index 3fc4ed99fad0fa..f8d32fc2d29252 100644 --- a/llvm/test/CodeGen/X86/addcarry.ll +++ b/llvm/test/CodeGen/X86/addcarry.ll @@ -1490,3 +1490,26 @@ define { i64, i64 } @addcarry_commutative_2(i64 %x0, i64 %x1, i64 %y0, i64 %y1) %r1 = insertvalue { i64, i64 } %r0, i64 %b1s, 1 ret { i64, i64 } %r1 } + +define i1 @pr84831(i64 %arg) { +; CHECK-LABEL: pr84831: +; CHECK: # %bb.0: +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: setne %al +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: addb $-1, %al +; CHECK-NEXT: adcq $1, %rcx +; CHECK-NEXT: 
setb %al +; CHECK-NEXT: retq + %a = icmp ult i64 0, %arg + %add1 = add i64 0, 1 + %carryout1 = icmp ult i64 %add1, 0 + %b = zext i1 %a to i64 + %add2 = add i64 %add1, %b + %carryout2 = icmp ult i64 %add2, %add1 + %zc1 = zext i1 %carryout1 to i63 + %zc2 = zext i1 %carryout2 to i63 + %or = or i63 %zc1, %zc2 + %trunc = trunc i63 %or to i1 + ret i1 %trunc +} diff --git a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll index 0826faa1071b01..482713e12d15c7 100644 --- a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll @@ -381,3 +381,25 @@ entry: %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer ret <16 x bfloat> %1 } + +define <16 x i32> @pr83358() { +; X86-LABEL: pr83358: +; X86: # %bb.0: +; X86-NEXT: vcvtneps2bf16y {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: vshufi64x2 $0, %zmm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc0,0x00] +; X86-NEXT: # zmm0 = zmm0[0,1,0,1,0,1,0,1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: pr83358: +; X64: # %bb.0: +; X64-NEXT: vcvtneps2bf16y {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vshufi64x2 $0, %zmm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc0,0x00] +; X64-NEXT: # zmm0 = zmm0[0,1,0,1,0,1,0,1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> ) + %2 = bitcast <8 x bfloat> %1 to <4 x i32> + %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <16 x i32> + ret <16 x i32> %3 +} diff --git a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll index 4988fc35b10eef..fdc25f44b156a7 100644 --- 
a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,EVEX256 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s --check-prefixes=CHECK,EVEX512 ; 256-bit @@ -236,3 +236,34 @@ define <8 x i16> @vpmullw128_test(<8 x i16> %i, <8 x i16> %j) { ret <8 x i16> %x } +define i16 @PR90356(<16 x i1> %a) { +; EVEX256-LABEL: PR90356: +; EVEX256: # %bb.0: +; EVEX256-NEXT: vpsllw $7, %xmm0, %xmm0 +; EVEX256-NEXT: vpmovb2m %xmm0, %k1 +; EVEX256-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; EVEX256-NEXT: movb $63, %al +; EVEX256-NEXT: kmovd %eax, %k1 +; EVEX256-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; EVEX256-NEXT: vptestmd %zmm0, %zmm0, %k0 +; EVEX256-NEXT: kmovd %k0, %eax +; EVEX256-NEXT: # kill: def $ax killed $ax killed $eax +; EVEX256-NEXT: vzeroupper +; EVEX256-NEXT: retq +; +; EVEX512-LABEL: PR90356: +; EVEX512: # %bb.0: +; EVEX512-NEXT: vpsllw $7, %xmm0, %xmm0 +; EVEX512-NEXT: vpmovb2m %xmm0, %k0 +; EVEX512-NEXT: vpmovm2w %k0, %ymm0 +; EVEX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; EVEX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; EVEX512-NEXT: vpmovw2m %ymm0, %k0 +; EVEX512-NEXT: kmovd %k0, %eax +; EVEX512-NEXT: # kill: def $ax killed $ax killed $eax +; EVEX512-NEXT: vzeroupper +; EVEX512-NEXT: retq + %1 = shufflevector <16 x i1> %a, <16 x i1> zeroinitializer, <16 x i32> + %2 = bitcast <16 x i1> %1 to i16 + ret i16 %2 +} diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index f2d3c4fb34199e..cd1dba17611628 100644 --- 
a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -2423,7 +2423,6 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind { ; AVXNC-LABEL: fptrunc_v16f32: ; AVXNC: # %bb.0: ; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0 -; AVXNC-NEXT: vinsertf128 $0, %xmm0, %ymm0, %ymm0 ; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1 ; AVXNC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVXNC-NEXT: retq diff --git a/llvm/test/CodeGen/X86/code-model-elf.ll b/llvm/test/CodeGen/X86/code-model-elf.ll index afcffb3a7adeda..b6634403dc1d05 100644 --- a/llvm/test/CodeGen/X86/code-model-elf.ll +++ b/llvm/test/CodeGen/X86/code-model-elf.ll @@ -346,7 +346,7 @@ define dso_local ptr @lea_forced_small_data() #0 { ; ; LARGE-STATIC-LABEL: lea_forced_small_data: ; LARGE-STATIC: # %bb.0: -; LARGE-STATIC-NEXT: movl $forced_small_data, %eax +; LARGE-STATIC-NEXT: movabsq $forced_small_data, %rax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: lea_forced_small_data: @@ -399,7 +399,7 @@ define dso_local i32 @load_forced_small_data() #0 { ; ; LARGE-STATIC-LABEL: load_forced_small_data: ; LARGE-STATIC: # %bb.0: -; LARGE-STATIC-NEXT: movl $forced_small_data+8, %eax +; LARGE-STATIC-NEXT: movabsq $forced_small_data+8, %rax ; LARGE-STATIC-NEXT: movl (%rax), %eax ; LARGE-STATIC-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll index d223b75419ac47..294fcd6a9563eb 100644 --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -1171,6 +1171,25 @@ define <4 x i32> @neg_scalar_broadcast_two_uses(i32 %a0, <4 x i32> %a1, ptr %a2) ret <4 x i32> %4 } +; PR84660 - check for illegal types +define <2 x i128> @neg_scalar_broadcast_illegaltype(i128 %arg) { +; CHECK-LABEL: neg_scalar_broadcast_illegaltype: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: notl %esi +; CHECK-NEXT: andl $1, %esi +; CHECK-NEXT: movq %rsi, 16(%rdi) +; CHECK-NEXT: movq %rsi, (%rdi) +; CHECK-NEXT: movq $0, 24(%rdi) +; 
CHECK-NEXT: movq $0, 8(%rdi) +; CHECK-NEXT: retq + %i = xor i128 %arg, 1 + %i1 = insertelement <2 x i128> zeroinitializer, i128 %i, i64 0 + %i2 = shufflevector <2 x i128> %i1, <2 x i128> zeroinitializer, <2 x i32> zeroinitializer + %i3 = and <2 x i128> , %i2 + ret <2 x i128> %i3 +} + define <2 x i64> @andnp_xx(<2 x i64> %v0) nounwind { ; SSE-LABEL: andnp_xx: ; SSE: # %bb.0: diff --git a/llvm/test/CodeGen/X86/commute-blend-avx2.ll b/llvm/test/CodeGen/X86/commute-blend-avx2.ll index b5ffe78d29a610..75511104580e90 100644 --- a/llvm/test/CodeGen/X86/commute-blend-avx2.ll +++ b/llvm/test/CodeGen/X86/commute-blend-avx2.ll @@ -88,3 +88,12 @@ define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, ptr %b) #0 { ret <4 x double> %2 } declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone + +define <4 x float> @commute_vblendpd_128_for_code_size(<4 x float> %a, <4 x float> %b) optsize { +; CHECK-LABEL: commute_vblendpd_128_for_code_size: +; CHECK: # %bb.0: +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; CHECK-NEXT: retq + %r = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> + ret <4 x float> %r +} diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll index 7f8115dc1ce389..b5f3e789918813 100644 --- a/llvm/test/CodeGen/X86/load-combine.ll +++ b/llvm/test/CodeGen/X86/load-combine.ll @@ -1282,3 +1282,35 @@ define i32 @zext_load_i32_by_i8_bswap_shl_16(ptr %arg) { %tmp8 = or i32 %tmp7, %tmp30 ret i32 %tmp8 } + +define i32 @pr80911_vector_load_multiuse(ptr %ptr, ptr %clobber) nounwind { +; CHECK-LABEL: pr80911_vector_load_multiuse: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl (%edx), %esi +; CHECK-NEXT: movzwl (%edx), %eax +; CHECK-NEXT: movl $0, (%ecx) +; CHECK-NEXT: movl %esi, (%edx) +; CHECK-NEXT: popl %esi +; CHECK-NEXT: retl +; +; CHECK64-LABEL: 
pr80911_vector_load_multiuse: +; CHECK64: # %bb.0: +; CHECK64-NEXT: movl (%rdi), %ecx +; CHECK64-NEXT: movzwl (%rdi), %eax +; CHECK64-NEXT: movl $0, (%rsi) +; CHECK64-NEXT: movl %ecx, (%rdi) +; CHECK64-NEXT: retq + %load = load <4 x i8>, ptr %ptr, align 16 + store i32 0, ptr %clobber + store <4 x i8> %load, ptr %ptr, align 16 + %e1 = extractelement <4 x i8> %load, i64 1 + %e1.ext = zext i8 %e1 to i32 + %e1.ext.shift = shl nuw nsw i32 %e1.ext, 8 + %e0 = extractelement <4 x i8> %load, i64 0 + %e0.ext = zext i8 %e0 to i32 + %res = or i32 %e1.ext.shift, %e0.ext + ret i32 %res +} diff --git a/llvm/test/CodeGen/X86/note-cet-property-inlineasm.ll b/llvm/test/CodeGen/X86/note-cet-property-inlineasm.ll new file mode 100644 index 00000000000000..a0e5b4add1b386 --- /dev/null +++ b/llvm/test/CodeGen/X86/note-cet-property-inlineasm.ll @@ -0,0 +1,30 @@ +; RUN: llc -mtriple x86_64-unknown-linux-gnu %s -o %t.o -filetype=obj +; RUN: llvm-readobj -n %t.o | FileCheck %s + +module asm ".pushsection \22.note.gnu.property\22,\22a\22,@note" +module asm " .p2align 3" +module asm " .long 1f - 0f" +module asm " .long 4f - 1f" +module asm " .long 5" +module asm "0: .asciz \22GNU\22" +module asm "1: .p2align 3" +module asm " .long 0xc0008002" +module asm " .long 3f - 2f" +module asm "2: .long ((1U << 0) | 0 | 0 | 0)" +module asm "3: .p2align 3" +module asm "4:" +module asm " .popsection" + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 4, !"cf-protection-return", i32 1} +!1 = !{i32 4, !"cf-protection-branch", i32 1} + +; CHECK: Type: NT_GNU_PROPERTY_TYPE_0 +; CHECK-NEXT: Property [ +; CHECK-NEXT: x86 feature: IBT, SHSTK +; CHECK-NEXT: ] +; CHECK: Type: NT_GNU_PROPERTY_TYPE_0 +; CHECK-NEXT: Property [ +; CHECK-NEXT: x86 ISA needed: x86-64-baseline +; CHECK-NEXT: ] diff --git a/llvm/test/CodeGen/X86/patchable-prologue.ll b/llvm/test/CodeGen/X86/patchable-prologue.ll index 71a392845fdea3..43761e3d1e1eb9 100644 --- a/llvm/test/CodeGen/X86/patchable-prologue.ll +++ 
b/llvm/test/CodeGen/X86/patchable-prologue.ll @@ -193,3 +193,20 @@ do.body: ; preds = %do.body, %entry do.end: ; preds = %do.body ret void } + + +; Test that inline asm is properly hotpatched. We currently don't examine the +; asm instruction when printing it, thus we always emit patching NOPs. + +; 64: inline_asm: +; 64-NEXT: # %bb.0: +; 64-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] +; 64-NEXT: #APP +; 64-NEXT: int3 # encoding: [0xcc] +; 64-NEXT: #NO_APP + +define dso_local void @inline_asm() "patchable-function"="prologue-short-redirect" { +entry: + call void asm sideeffect "int3", "~{dirflag},~{fpsr},~{flags}"() + ret void +} diff --git a/llvm/test/CodeGen/X86/sar_fold.ll b/llvm/test/CodeGen/X86/sar_fold.ll index 21655e19440afe..0f1396954b03a1 100644 --- a/llvm/test/CodeGen/X86/sar_fold.ll +++ b/llvm/test/CodeGen/X86/sar_fold.ll @@ -44,3 +44,44 @@ define i32 @shl24sar25(i32 %a) #0 { %2 = ashr exact i32 %1, 25 ret i32 %2 } + +define void @shl144sar48(ptr %p) #0 { +; CHECK-LABEL: shl144sar48: +; CHECK: # %bb.0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movswl (%eax), %ecx +; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: sarl $31, %edx +; CHECK-NEXT: shldl $2, %ecx, %edx +; CHECK-NEXT: shll $2, %ecx +; CHECK-NEXT: movl %ecx, 12(%eax) +; CHECK-NEXT: movl %edx, 16(%eax) +; CHECK-NEXT: movl $0, 8(%eax) +; CHECK-NEXT: movl $0, 4(%eax) +; CHECK-NEXT: movl $0, (%eax) +; CHECK-NEXT: retl + %a = load i160, ptr %p + %1 = shl i160 %a, 144 + %2 = ashr exact i160 %1, 46 + store i160 %2, ptr %p + ret void +} + +define void @shl144sar2(ptr %p) #0 { +; CHECK-LABEL: shl144sar2: +; CHECK: # %bb.0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movswl (%eax), %ecx +; CHECK-NEXT: shll $14, %ecx +; CHECK-NEXT: movl %ecx, 16(%eax) +; CHECK-NEXT: movl $0, 8(%eax) +; CHECK-NEXT: movl $0, 12(%eax) +; CHECK-NEXT: movl $0, 4(%eax) +; CHECK-NEXT: movl $0, (%eax) +; CHECK-NEXT: retl + %a = load i160, ptr %p + %1 = shl i160 %a, 144 + %2 = ashr exact i160 %1, 2 + store 
i160 %2, ptr %p + ret void +} diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll index cf45641fba6321..3316a332fafdff 100644 --- a/llvm/test/CodeGen/X86/shift-combine.ll +++ b/llvm/test/CodeGen/X86/shift-combine.ll @@ -787,3 +787,38 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32 %r = or <4 x i32> %or.ab, %or.cd ret <4 x i32> %r } + +; Reproducer for a DAGCombiner::combineShiftOfShiftedLogic bug. DAGCombiner +; need to check that the sum of the shift amounts fits in i8, which is the +; legal type used to described X86 shift amounts. Verify that we do not try to +; create a shift with 130+160 as shift amount, and verify that the stored +; value do not depend on %a1. +define void @combineShiftOfShiftedLogic(i128 %a1, i32 %a2, ptr %p) { +; X86-LABEL: combineShiftOfShiftedLogic: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, 20(%ecx) +; X86-NEXT: movl $0, 16(%ecx) +; X86-NEXT: movl $0, 12(%ecx) +; X86-NEXT: movl $0, 8(%ecx) +; X86-NEXT: movl $0, 4(%ecx) +; X86-NEXT: movl $0, (%ecx) +; X86-NEXT: retl +; +; X64-LABEL: combineShiftOfShiftedLogic: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edx killed $edx def $rdx +; X64-NEXT: shlq $32, %rdx +; X64-NEXT: movq %rdx, 16(%rcx) +; X64-NEXT: movq $0, 8(%rcx) +; X64-NEXT: movq $0, (%rcx) +; X64-NEXT: retq + %zext1 = zext i128 %a1 to i192 + %zext2 = zext i32 %a2 to i192 + %shl = shl i192 %zext1, 130 + %or = or i192 %shl, %zext2 + %res = shl i192 %or, 160 + store i192 %res, ptr %p, align 8 + ret void +} diff --git a/llvm/test/CodeGen/X86/tls-models.ll b/llvm/test/CodeGen/X86/tls-models.ll index fc8e302338d960..8de9de15a5f06e 100644 --- a/llvm/test/CodeGen/X86/tls-models.ll +++ b/llvm/test/CodeGen/X86/tls-models.ll @@ -5,6 +5,7 @@ ; Darwin always uses the same model. 
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck -check-prefix=DARWIN %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -code-model=large | FileCheck -check-prefix=DARWIN %s @external_gd = external thread_local global i32 @internal_gd = internal thread_local global i32 42 diff --git a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll index e03b4c1d34de15..07443a62b93391 100644 --- a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll +++ b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll @@ -1,19 +1,22 @@ ; RUN: llc --filetype=obj --mtriple=loongarch64 --mattr=-relax %s -o %t.o ; RUN: llvm-readobj -r %t.o | FileCheck --check-prefixes=RELOCS-BOTH,RELOCS-NORL %s -; RUN: llvm-objdump --source %t.o | FileCheck --check-prefix=SOURCE %s -; RUN: llvm-dwarfdump --debug-info --debug-line %t.o | FileCheck --check-prefix=DWARF %s +; RUN: llvm-objdump --source %t.o | FileCheck --check-prefixes=SOURCE,SOURCE-NORL %s +; RUN: llvm-dwarfdump --debug-info --debug-line %t.o | FileCheck --check-prefixes=DWARF,DWARF-NORL %s ; RUN: llc --filetype=obj --mtriple=loongarch64 --mattr=+relax %s -o %t.r.o ; RUN: llvm-readobj -r %t.r.o | FileCheck --check-prefixes=RELOCS-BOTH,RELOCS-ENRL %s -; RUN: llvm-objdump --source %t.r.o | FileCheck --check-prefix=SOURCE %s -; RUN: llvm-dwarfdump --debug-info --debug-line %t.r.o | FileCheck --check-prefix=DWARF %s +; RUN: llvm-objdump --source %t.r.o | FileCheck --check-prefixes=SOURCE,SOURCE-ENRL %s +; RUN: llvm-dwarfdump --debug-info --debug-line %t.r.o | FileCheck --check-prefixes=DWARF,DWARF-ENRL %s ; RELOCS-BOTH: Relocations [ ; RELOCS-BOTH-NEXT: Section ({{.*}}) .rela.text { -; RELOCS-BOTH-NEXT: 0x14 R_LARCH_PCALA_HI20 sym 0x0 -; RELOCS-ENRL-NEXT: 0x14 R_LARCH_RELAX - 0x0 -; RELOCS-BOTH-NEXT: 0x18 R_LARCH_PCALA_LO12 sym 0x0 -; RELOCS-ENRL-NEXT: 0x18 R_LARCH_RELAX - 0x0 +; RELOCS-NORL-NEXT: 0x14 R_LARCH_PCALA_HI20 sym 0x0 +; RELOCS-NORL-NEXT: 0x18 R_LARCH_PCALA_LO12 
sym 0x0 +; RELOCS-ENRL-NEXT: 0x0 R_LARCH_ALIGN .Lla-relax-align0 0x5 +; RELOCS-ENRL-NEXT: 0x30 R_LARCH_PCALA_HI20 sym 0x0 +; RELOCS-ENRL-NEXT: 0x30 R_LARCH_RELAX - 0x0 +; RELOCS-ENRL-NEXT: 0x34 R_LARCH_PCALA_LO12 sym 0x0 +; RELOCS-ENRL-NEXT: 0x34 R_LARCH_RELAX - 0x0 ; RELOCS-BOTH-NEXT: } ; RELOCS-BOTH: Section ({{.*}}) .rela.debug_frame { ; RELOCS-NORL-NEXT: 0x1C R_LARCH_32 .debug_frame 0x0 @@ -36,7 +39,8 @@ ; RELOCS-BOTH-NEXT: } ; RELOCS-BOTH-NEXT: ] -; SOURCE: 0000000000000000 : +; SOURCE-NORL: 0000000000000000 : +; SOURCE-ENRL: 000000000000001c : ; SOURCE: ; { ; SOURCE: ; asm volatile( ; SOURCE: ; return 0; @@ -87,11 +91,16 @@ ; DWARF-EMPTY: ; DWARF-NEXT: Address Line Column File ISA Discriminator OpIndex Flags ; DWARF-NEXT: ------------------ ------ ------ ------ --- ------------- ------- ------------- -; DWARF-NEXT: 0x0000000000000000 2 0 0 0 0 0 is_stmt -; DWARF-NEXT: 0x0000000000000010 3 3 0 0 0 0 is_stmt prologue_end -; DWARF-NEXT: 0x0000000000000020 10 3 0 0 0 0 is_stmt -; DWARF-NEXT: 0x000000000000002c 10 3 0 0 0 0 epilogue_begin -; DWARF-NEXT: 0x0000000000000034 10 3 0 0 0 0 end_sequence +; DWARF-NORL-NEXT: 0x0000000000000000 2 0 0 0 0 0 is_stmt +; DWARF-NORL-NEXT: 0x0000000000000010 3 3 0 0 0 0 is_stmt prologue_end +; DWARF-NORL-NEXT: 0x0000000000000020 10 3 0 0 0 0 is_stmt +; DWARF-NORL-NEXT: 0x000000000000002c 10 3 0 0 0 0 epilogue_begin +; DWARF-NORL-NEXT: 0x0000000000000034 10 3 0 0 0 0 end_sequence +; DWARF-ENRL-NEXT: 0x000000000000001c 2 0 0 0 0 0 is_stmt +; DWARF-ENRL-NEXT: 0x000000000000002c 3 3 0 0 0 0 is_stmt prologue_end +; DWARF-ENRL-NEXT: 0x000000000000003c 10 3 0 0 0 0 is_stmt +; DWARF-ENRL-NEXT: 0x0000000000000048 10 3 0 0 0 0 epilogue_begin +; DWARF-ENRL-NEXT: 0x0000000000000050 10 3 0 0 0 0 end_sequence ; ModuleID = 'dwarf-loongarch-relocs.c' source_filename = "dwarf-loongarch-relocs.c" diff --git a/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll b/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll index 
76afc4bf007c2d..8b387cd4962979 100644 --- a/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll +++ b/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll @@ -78,6 +78,26 @@ entry: ; CHECK-LABEL: atomic8_xchg_monotonic ; CHECK: call i8 @__tsan_atomic8_exchange(ptr %a, i8 0, i32 0), !dbg +define void @atomic8_xchg_monotonic_ptr(ptr %a, ptr %b) nounwind uwtable { +entry: + atomicrmw xchg ptr %a, ptr %b monotonic, !dbg !7 + ret void, !dbg !7 +} +; CHECK-LABEL: atomic8_xchg_monotonic_ptr +; CHECK: [[ARG:%.*]] = ptrtoint ptr %b to i64, !dbg +; CHECK: [[RES:%.*]] = call i64 @__tsan_atomic64_exchange(ptr %a, i64 [[ARG]], i32 0), !dbg +; CHECK: [[CAST:%.*]] = inttoptr i64 [[RES]] to ptr, !dbg + +define void @atomic8_xchg_monotonic_float(ptr %a, float %b) nounwind uwtable { +entry: + atomicrmw xchg ptr %a, float %b monotonic, !dbg !7 + ret void, !dbg !7 +} +; CHECK-LABEL: atomic8_xchg_monotonic_float +; CHECK: [[ARG:%.*]] = bitcast float %b to i32, !dbg +; CHECK: [[RES:%.*]] = call i32 @__tsan_atomic32_exchange(ptr %a, i32 [[ARG]], i32 0), !dbg +; CHECK: [[CAST:%.*]] = bitcast i32 [[RES]] to float, !dbg + define void @atomic8_add_monotonic(ptr %a) nounwind uwtable { entry: atomicrmw add ptr %a, i8 0 monotonic, !dbg !7 diff --git a/llvm/test/MC/AArch64/armv8.2a-dotprod.s b/llvm/test/MC/AArch64/armv8.2a-dotprod.s index 9c4a6cad7e07a6..26afbe149dd00b 100644 --- a/llvm/test/MC/AArch64/armv8.2a-dotprod.s +++ b/llvm/test/MC/AArch64/armv8.2a-dotprod.s @@ -15,6 +15,7 @@ // RUN: llvm-mc -triple aarch64 -mattr=+v8r -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: llvm-mc -triple aarch64 -mcpu=ampere1 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: llvm-mc -triple aarch64 -mcpu=ampere1a -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD +// RUN: llvm-mc -triple aarch64 -mcpu=ampere1b -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: not llvm-mc -triple aarch64 -mattr=+v8.2a -show-encoding < %s 2> %t // 
RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s @@ -42,6 +43,8 @@ // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s // RUN: not llvm-mc -triple aarch64 -mcpu=ampere1a -mattr=-dotprod -show-encoding < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s +// RUN: not llvm-mc -triple aarch64 -mcpu=ampere1b -mattr=-dotprod -show-encoding < %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s udot v0.2s, v1.8b, v2.8b sdot v0.2s, v1.8b, v2.8b diff --git a/llvm/test/MC/AArch64/cfi-bad-nesting-darwin.s b/llvm/test/MC/AArch64/cfi-bad-nesting-darwin.s index 235b7d44809929..3a5af86defc592 100644 --- a/llvm/test/MC/AArch64/cfi-bad-nesting-darwin.s +++ b/llvm/test/MC/AArch64/cfi-bad-nesting-darwin.s @@ -8,6 +8,10 @@ .p2align 2 _locomotive: .cfi_startproc + ; An N_ALT_ENTRY symbol can be defined in the middle of a subsection, so + ; these are opted out of the .cfi_{start,end}proc nesting check. + .alt_entry _engineer +_engineer: ret ; It is invalid to have a non-private label between .cfi_startproc and @@ -17,7 +21,7 @@ _locomotive: .p2align 2 _caboose: ; DARWIN: [[#@LINE-1]]:1: error: non-private labels cannot appear between .cfi_startproc / .cfi_endproc pairs -; DARWIN: [[#@LINE-10]]:2: error: previous .cfi_startproc was here +; DARWIN: [[#@LINE-14]]:2: error: previous .cfi_startproc was here ret .cfi_endproc diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 35411ee0ba2a5e..c9c4fceffaeb0c 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -397,6 +397,51 @@ v_ctz_i32_b32 v5, src_scc v_ctz_i32_b32 v255, 0xaf123456 // GFX12: encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf] +v_cvt_f32_bf8_e32 v1, s3 +// GFX12: encoding: [0x03,0xda,0x02,0x7e] + +v_cvt_f32_bf8_e32 v1, 3 +// GFX12: encoding: [0x83,0xda,0x02,0x7e] + +v_cvt_f32_bf8_e32 v1, v3 +// GFX12: encoding: [0x03,0xdb,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, s3 +// GFX12: encoding: 
[0x03,0xd8,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, 3 +// GFX12: encoding: [0x83,0xd8,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, v3 +// GFX12: encoding: [0x03,0xd9,0x02,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], s3 +// GFX12: encoding: [0x03,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], s5 +// GFX12: encoding: [0x05,0xde,0x06,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], 3 +// GFX12: encoding: [0x83,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], 3 +// GFX12: encoding: [0x83,0xde,0x06,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], v3 +// GFX12: encoding: [0x03,0xdf,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], v3 +// GFX12: encoding: [0x03,0xdf,0x06,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], s3 +// GFX12: encoding: [0x03,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], 3 +// GFX12: encoding: [0x83,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], v3 +// GFX12: encoding: [0x03,0xdd,0x04,0x7e] + v_cvt_f16_f32 v5, v1 // GFX12: encoding: [0x01,0x15,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index dd6afb28c396a7..5e0e1b688bc582 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -337,6 +337,18 @@ v_ctz_i32_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_ctz_i32_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x05,0x30] +v_cvt_f32_fp8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc +// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac] + +v_cvt_f32_fp8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe +// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e] + +v_cvt_f32_bf8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc +// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac] + +v_cvt_f32_bf8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe +// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e] + v_cvt_f16_f32 v5, v1 quad_perm:[3,2,1,0] // 
GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index 6530de0268456d..36c89710ce8f89 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -73,6 +73,18 @@ v_ctz_i32_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_ctz_i32_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: encoding: [0xe9,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00] +v_cvt_f32_fp8 v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_fp8 v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05] + +v_cvt_f32_bf8 v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_bf8 v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05] + v_cvt_f16_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: encoding: [0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s index cf3f9c45bdcc8e..beb57999b855ea 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s @@ -1099,6 +1099,42 @@ v_cubetc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 // GFX12: encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] +v_cvt_pk_fp8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_pk_fp8_f32 v1, -v2, |v3| +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] + +v_cvt_pk_fp8_f32 v1, s2, 3 +// GFX12: encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00] + +v_cvt_pk_bf8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_pk_bf8_f32 v1, -v2, |v3| +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] + +v_cvt_pk_bf8_f32 v1, s2, 3 +// GFX12: encoding: 
[0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00] + +v_cvt_sr_fp8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_sr_fp8_f32 v10, s2, v5 +// GFX12: encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00] + +v_cvt_sr_fp8_f32 v5, -|v255|, v4 +// GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20] + +v_cvt_sr_bf8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_sr_bf8_f32 v10, s2, v5 +// GFX12: encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00] + +v_cvt_sr_bf8_f32 v5, -|v255|, v4 +// GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] + v_cvt_pk_i16_f32 v5, v1, v2 // GFX12: encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s index 26f63102df9508..df3430f376f69e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s @@ -1015,6 +1015,114 @@ v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_m v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_bf8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe 
bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_fp8_f32_e64_dpp 
v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: 
[0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s index de294b1ff2a22a..09dd6df618c5b6 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s @@ -570,6 +570,54 @@ v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // 
GFX12: [0xff,0x87,0x0e,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,2,3,0,1] +// GFX12: encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] + +v_cvt_pk_fp8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + +v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: 
[0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index e35bb632906722..7ee60262a5c1b4 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -396,6 +396,144 @@ v_ctz_i32_b32_e64 v5, src_scc v_ctz_i32_b32_e64 v255, 0xaf123456 // GFX12: encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] +v_cvt_f32_bf8_e64 v1, s3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 
op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 +// GFX12: encoding: 
[0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], s3 +// GFX12: encoding: [0x03,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], s3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], 3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], v3 +// GFX12: encoding: [0x03,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], v3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], s3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], 3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], 3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], v3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], v3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + v_cvt_f16_f32_e64 v5, v1 // GFX12: encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s 
b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 6b915bd14683a2..808f941197c427 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -336,6 +336,18 @@ v_ctz_i32_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_ctz_i32_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +V_CVT_F32_FP8_e64_dpp v5, v1 quad_perm:[3,1,2,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0x27,0x00,0x2d] + +V_CVT_F32_FP8_e64_dpp v1, v3 quad_perm:[2,1,0,3] row_mask:0x5 bank_mask:0xe +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0xc6,0x00,0x5e] + +V_CVT_F32_BF8_e64_dpp v5, v1 quad_perm:[0,3,2,1] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0x6c,0x00,0x2d] + +V_CVT_F32_BF8_e64_dpp v1, v3 quad_perm:[0,1,3,2] row_mask:0x5 bank_mask:0xe +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0xb4,0x00,0x5e] + v_cvt_f16_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index 61266f3776c284..f7b51cfb6bda8e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -84,6 +84,18 @@ v_ctz_i32_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_ctz_i32_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x00,0xba,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_cvt_f32_fp8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_fp8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: 
[0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] + +v_cvt_f32_bf8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_bf8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] + v_cvt_f16_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s new file mode 100644 index 00000000000000..e1cd0cab663472 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s @@ -0,0 +1,1529 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: %s + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], s[0:3], v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], 1.0, v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 
v[8:15], v[0:3], 1.0, v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3], v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], 1.0, v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0, v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], s[0:3], v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], 1.0, v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0, v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], 
v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3], v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0, v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0, v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], 1, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: 
[0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] + + + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], s0, v1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], 1, v1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: 
v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: 
v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: 
v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], 1, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] + +v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: 
[0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] + + + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3], v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20 +// GFX12-ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], 1.0, v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0, v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3], v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20 +// GFX12-ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0, v[4:11], v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0, v20 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3], v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16 +// GFX12-ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], 1.0, v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0, v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3], v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16 
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0, v[4:11], v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0, v16 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], 1, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu4 v[3:10], s0, v[1:2], v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1], v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], 1, v[1:2], v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1, v11 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], 1, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1], v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0, v[2:5], v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0, v14 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s new file mode 100644 index 00000000000000..8bd9e5039b7207 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s @@ -0,0 +1,1529 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: %s + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_f16 v[4:7], 
s[0:1], v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], 1.0, v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0, v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1], v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], 
1.0, v[2:3], v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0, v[4:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f16_16x16x16_f16 v[4:5], s[0:1], v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], 1.0, v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0, v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 +// GFX12: 
v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1], v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0, v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0, v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], 
v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], 1, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: 
[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x16_iu4 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], 1, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 +// GFX12: 
v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 
v[2:5] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 
v[2:5] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 
v[2:5] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] + + + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + 
+v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1], v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], 1.0, v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0, v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1], v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for 
instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0, v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0, v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1], v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], 
v[0:1], v[2:5], s8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], 1.0, v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0, v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] +// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1], v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0, v[2:5], v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0, v8 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], 1, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu4 v[2:5], s0, v1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], 1, v1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x64_iu4 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], 1, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt b/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt index 907d0c319efd57..259cb9dbc52a46 100644 --- a/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt +++ b/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt @@ -14,6 +14,7 @@ # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=neoverse-n2 --disassemble < %s | FileCheck %s # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=ampere1 --disassemble < %s | FileCheck %s # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=ampere1a --disassemble < %s | FileCheck %s +# RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=ampere1b --disassemble < %s | FileCheck %s # CHECK: ldaprb w0, [x0] # CHECK: ldaprh w0, [x0] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt index a839f03c42ba19..39bb7338c80748 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt @@ -397,6 +397,42 @@ # GFX12: 
v_ctz_i32_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf] 0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_f32_bf8_e32 v1, s3 ; encoding: [0x03,0xda,0x02,0x7e] +0x03,0xda,0x02,0x7e + +# GFX12: v_cvt_f32_bf8_e32 v1, 3 ; encoding: [0x83,0xda,0x02,0x7e] +0x83,0xda,0x02,0x7e + +# GFX12: v_cvt_f32_bf8_e32 v1, v3 ; encoding: [0x03,0xdb,0x02,0x7e] +0x03,0xdb,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, s3 ; encoding: [0x03,0xd8,0x02,0x7e] +0x03,0xd8,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, 3 ; encoding: [0x83,0xd8,0x02,0x7e] +0x83,0xd8,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e] +0x03,0xd9,0x02,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], s3 ; encoding: [0x03,0xde,0x04,0x7e] +0x03,0xde,0x04,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], 3 ; encoding: [0x83,0xde,0x04,0x7e] +0x83,0xde,0x04,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xdf,0x04,0x7e] +0x03,0xdf,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xdc,0x04,0x7e] +0x03,0xdc,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xdc,0x04,0x7e] +0x83,0xdc,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xdd,0x04,0x7e] +0x03,0xdd,0x04,0x7e + # GFX12: v_cvt_f16_f32_e32 v5, v1 ; encoding: [0x01,0x15,0x0a,0x7e] 0x01,0x15,0x0a,0x7e diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index bcb9ad9febb96d..5848333f41ef7c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -337,6 +337,18 @@ # GFX12: v_ctz_i32_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc ; encoding: 
[0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac] +0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac + +# GFX12: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe ; encoding: [0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e] +0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc ; encoding: [0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac] +0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe ; encoding: [0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e] +0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e + # GFX12: v_cvt_f16_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 928165997dd919..d42e9ae25039bc 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -49,6 +49,18 @@ # GFX12: v_ctz_i32_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_f32_fp8_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa] +0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_fp8_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05] +0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05 + +# GFX12: v_cvt_f32_bf8_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa] +0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05] +0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05 + # GFX12: v_cvt_f16_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt index db690aa99e4ab7..f86903b8de44b7 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt @@ -993,6 +993,42 @@ # GFX12: v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] 0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00] +0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00 + +# GFX12: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00] +0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00] +0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20] +0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20 + +# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_sr_bf8_f32 v10, s2, v5 ; 
encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00] +0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00 + +# GFX12: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] +0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20 + # GFX12: v_cvt_pk_i16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00] 0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt index 69f61c7eb8030f..1be1d6e91ad8a7 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt @@ -825,6 +825,114 @@ # GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] 0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] 
+0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: 
[0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: 
v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: 
[0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + # GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt index a7f0183016147f..44b3f7594029fd 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt @@ -495,6 +495,54 @@ # GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0x87,0x0e,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] 0xff,0x87,0x0e,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] +0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
+0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 + # GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 4fe4284e8eb4e6..9a8368a65f3d37 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ 
b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -396,6 +396,42 @@ # GFX12: v_ctz_i32_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_f32_bf8_e64 v1, s3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] +0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] +0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_bf8_e64 v1, v3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] +0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, s3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] +0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, 3 ; encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] +0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, v3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] +0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] +0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] +0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] +0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] +0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] +0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] +0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00 + # GFX12: v_cvt_f16_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] 
0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index e914d139e240e1..8af274e0b4028f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -336,6 +336,18 @@ # GFX12: v_ctz_i32_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_f32_fp8_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x2 bank_mask:0xd ; encoding: [0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d] +0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d + +# GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e] +0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e + +# GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x2 bank_mask:0xd ; encoding: [0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d] +0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d + +# GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e] +0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e + # GFX12: v_cvt_f16_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 2a4b677620d387..3d48d58c775b18 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -72,6 +72,18 @@ # GFX12: v_ctz_i32_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xba,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0xba,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_f32_fp8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] +0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] +0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05 + +# GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] +0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] +0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05 + # GFX12: v_cvt_f16_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt new file mode 100644 index 00000000000000..5079d2f0896561 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt @@ -0,0 +1,1628 @@ +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX12-ERR %s + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], 
v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x40,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x58,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: 
[0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x40,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; 
encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x41,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] + 
+[0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x41,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: 
[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x42,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] + 
+[0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x42,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: 
[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x43,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] + 
+[0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x43,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: 
[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x48,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x41,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x04,0x44,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x44,0xcc,0x81,0x04,0x12,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x01,0x04,0x12,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x03,0x11,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x03,0x10,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] + + + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] + 
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1/*Invalid register, operand has 'VGPR_32' register class*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], 1/*Invalid immediate*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1/*Invalid immediate*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x46,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding 
+ +[0x04,0x00,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: 
[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x46,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x48,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x04,0x58,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x48,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 
1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x47,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x47,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: 
v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x49,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x04,0x42,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x49,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] # 
clamp +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x41,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x4a,0xcc,0x81,0x04,0x12,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x01,0x04,0x12,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x03,0x11,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x03,0x10,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] + + + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] +# GFX12:v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0xc0,0x50,0xcc,0x00,0x09,0x52,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0] +# GFX12:v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: 
warning: invalid instruction encoding + +[0x0c,0x60,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x44,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0 +# 
GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] + +[0x0c,0x40,0x50,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x01,0x50,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0xc0,0x51,0xcc,0x00,0x09,0x52,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x0c,0x60,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x44,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0 +# GFX12: 
v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18] + +[0x0c,0x40,0x51,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x01,0x50,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0xc0,0x52,0xcc,0x00,0x09,0x42,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x0c,0x60,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x44,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0 +# GFX12: 
v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18] # sgpr src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18] + +[0x0c,0x40,0x52,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x01,0x40,0x1c] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0xc0,0x53,0xcc,0x00,0x09,0x42,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x0c,0x60,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] + +[0x0c,0x44,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0 +# GFX12: 
v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18] # sgpr src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18] + +[0x0c,0x40,0x53,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x01,0x40,0x1c] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x18] + + + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + 
+[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register 
class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x54,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x01,0x04,0x3a,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x03,0x39,0x1c] # 1 src1 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x03,0x38,0x1c] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x18] + +[0x06,0x40,0x54,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x02,0x18] + + + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x60,0x55,0xcc,0x00,0x03,0x2e,0x1c] # 
op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] + +[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x58,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c] # sgpr src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c] # sgpr src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], 
v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x01,0x2c,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18] # sgpr src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18] + +[0x03,0x40,0x55,0xcc,0x81,0x02,0x2e,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], 1/*Invalid immediate*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x01,0x02,0x2e,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x2d,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1/*Invalid immediate*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + + + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: 
warning: invalid instruction encoding + +[0x06,0x00,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: 
[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x56,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x01,0x04,0x3a,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x03,0x39,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x03,0x38,0x1c] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x18] + +[0x06,0x40,0x56,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x02,0x18] + + + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x57,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 
index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x57,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 
1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x58,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x3c] # 
neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x58,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0xc0,0x59,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x59,0xcc,0x00,0x05,0x3a,0x1c] # 
neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x59,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + 
+[0x06,0xc0,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x60,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] + +[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x58,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x42,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x44,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18] + +[0x06,0x40,0x5a,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x18] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt new file mode 100644 index 00000000000000..61700faa8e6075 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt @@ -0,0 +1,1628 @@ +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +# RUN: not llvm-mc -disassemble -arch=amdgcn 
-mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX12-ERR %s + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x40,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: 
[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x40,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], 
v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x41,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] + 
+[0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x41,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: 
[0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x42,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# 
GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x42,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] + + + 
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x43,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], 
v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18] + +[0x04,0x40,0x43,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: 
v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x02,0x44,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x44,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] + 
+[0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x46,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x02,0x00,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18] + 
+[0x02,0x40,0x46,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x47,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x47,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x48,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] + 
+[0x02,0x41,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x48,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] + 
+[0x02,0xc0,0x49,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] + +[0x02,0x41,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] + 
+[0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x49,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] + +[0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] + + + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x02,0x60,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 
'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x4a,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0xc0,0x50,0xcc,0x00,0x05,0x2a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x60,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x06,0x00,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x44,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] # 
sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x50,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0xc0,0x51,0xcc,0x00,0x05,0x2a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x60,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x44,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0 +# GFX12: 
v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x51,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0xc0,0x52,0xcc,0x00,0x05,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 
index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x60,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:3 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] + 
+[0x06,0x44,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x52,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0xc0,0x53,0xcc,0x00,0x05,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; 
encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x60,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:3 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] + 
+[0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x44,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x53,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x18] + + + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] 
# clamp +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x03,0x41,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x54,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x01,0x02,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1c,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] + + + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] + 
+[0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,1,0] +# 
GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x55,0xcc,0x81,0x02,0x1a,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x01,0x02,0x1a,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x19,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x18,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x18] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + + + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: 
v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x60,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x58,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# 
GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x56,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x01,0x02,0x1e,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1c,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] + + + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x57,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: 
[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] + 
+[0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x57,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x58,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction 
encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 
'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x58,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x59,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] # 
sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x59,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x03,0x00,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' 
register class*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x5a,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x18] diff --git a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt index 1a73178a1f6a7e..d6f10e96d4769f 100644 --- a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt +++ b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt @@ -116,14 +116,14 @@ 0x10 0x08 0x02 0x46 # CHECK: sel.s $f0, $f1, $f2 0x35 0x10 0x64 0x00 # CHECK: seleqz $2, $3, $4 0x37 0x10 0x64 0x00 # CHECK: selnez $2, $3, $4 -0x1d 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4 -0x1d 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4 0x1c 0x10 0x04 0x46 # CHECK: min.s $f0, $f2, $f4 0x1c 0x10 0x24 0x46 # CHECK: min.d $f0, $f2, $f4 +0x1d 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4 +0x1d 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4 +0x1e 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4 +0x1e 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4 0x1f 0x10 0x04 0x46 # CHECK: maxa.s $f0, $f2, $f4 0x1f 0x10 0x24 0x46 # CHECK: maxa.d $f0, $f2, $f4 -0x1e 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4 -0x1e 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4 0x04 0x00 0x42 0x34 # CHECK: ori $2, $2, 4 0x14 0x10 0x04 0x46 # 
CHECK: seleqz.s $f0, $f2, $f4 0x14 0x10 0x24 0x46 # CHECK: seleqz.d $f0, $f2, $f4 diff --git a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt index 53ea0258e1c4bc..e1ba009f3c4c8c 100644 --- a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt +++ b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt @@ -92,8 +92,8 @@ 0x46 0x04 0x10 0x14 # CHECK: seleqz.s $f0, $f2, $f4 0x46 0x04 0x10 0x17 # CHECK: selnez.s $f0, $f2, $f4 0x46 0x04 0x10 0x1c # CHECK: min.s $f0, $f2, $f4 -0x46 0x04 0x10 0x1d # CHECK: max.s $f0, $f2, $f4 -0x46 0x04 0x10 0x1e # CHECK: mina.s $f0, $f2, $f4 +0x46 0x04 0x10 0x1d # CHECK: mina.s $f0, $f2, $f4 +0x46 0x04 0x10 0x1e # CHECK: max.s $f0, $f2, $f4 0x46 0x04 0x10 0x1f # CHECK: maxa.s $f0, $f2, $f4 0x46 0x04 0x18 0x98 # CHECK: maddf.s $f2, $f3, $f4 0x46 0x04 0x18 0x99 # CHECK: msubf.s $f2, $f3, $f4 @@ -103,8 +103,8 @@ 0x46 0x24 0x10 0x14 # CHECK: seleqz.d $f0, $f2, $f4 0x46 0x24 0x10 0x17 # CHECK: selnez.d $f0, $f2, $f4 0x46 0x24 0x10 0x1c # CHECK: min.d $f0, $f2, $f4 -0x46 0x24 0x10 0x1d # CHECK: max.d $f0, $f2, $f4 -0x46 0x24 0x10 0x1e # CHECK: mina.d $f0, $f2, $f4 +0x46 0x24 0x10 0x1d # CHECK: mina.d $f0, $f2, $f4 +0x46 0x24 0x10 0x1e # CHECK: max.d $f0, $f2, $f4 0x46 0x24 0x10 0x1f # CHECK: maxa.d $f0, $f2, $f4 0x46 0x24 0x18 0x98 # CHECK: maddf.d $f2, $f3, $f4 0x46 0x24 0x18 0x99 # CHECK: msubf.d $f2, $f3, $f4 diff --git a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt index 9aeea45472aebb..a7dfbd209b4e48 100644 --- a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt +++ b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt @@ -140,15 +140,15 @@ 0x43 0x00 0x50 0xec # CHECK: lwupc $2, 268 0x98 0x18 0x24 0x46 # CHECK: maddf.d $f2, $f3, $f4 0x98 0x18 0x04 0x46 # CHECK: maddf.s $f2, $f3, $f4 -0x1d 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4 -0x1d 0x10 0x04 0x46 # 
CHECK: max.s $f0, $f2, $f4 +0x1e 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4 +0x1e 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4 0x1f 0x10 0x24 0x46 # CHECK: maxa.d $f0, $f2, $f4 0x1f 0x10 0x04 0x46 # CHECK: maxa.s $f0, $f2, $f4 0x01 0x78 0x08 0x40 # CHECK: mfc0 $8, $15, 1 0x1c 0x10 0x24 0x46 # CHECK: min.d $f0, $f2, $f4 0x1c 0x10 0x04 0x46 # CHECK: min.s $f0, $f2, $f4 -0x1e 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4 -0x1e 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4 +0x1d 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4 +0x1d 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4 0xda 0x10 0x64 0x00 # CHECK: mod $2, $3, $4 0xdb 0x10 0x64 0x00 # CHECK: modu $2, $3, $4 0x25 0x78 0xe0 0x03 # CHECK: move $15, $ra diff --git a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt index 32b91c6c6842e1..0030e51d6c2387 100644 --- a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt +++ b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt @@ -111,8 +111,8 @@ 0x46 0x04 0x10 0x14 # CHECK: seleqz.s $f0, $f2, $f4 0x46 0x04 0x10 0x17 # CHECK: selnez.s $f0, $f2, $f4 0x46 0x04 0x10 0x1c # CHECK: min.s $f0, $f2, $f4 -0x46 0x04 0x10 0x1d # CHECK: max.s $f0, $f2, $f4 -0x46 0x04 0x10 0x1e # CHECK: mina.s $f0, $f2, $f4 +0x46 0x04 0x10 0x1d # CHECK: mina.s $f0, $f2, $f4 +0x46 0x04 0x10 0x1e # CHECK: max.s $f0, $f2, $f4 0x46 0x04 0x10 0x1f # CHECK: maxa.s $f0, $f2, $f4 0x46 0x04 0x18 0x98 # CHECK: maddf.s $f2, $f3, $f4 0x46 0x04 0x18 0x99 # CHECK: msubf.s $f2, $f3, $f4 @@ -122,8 +122,8 @@ 0x46 0x24 0x10 0x14 # CHECK: seleqz.d $f0, $f2, $f4 0x46 0x24 0x10 0x17 # CHECK: selnez.d $f0, $f2, $f4 0x46 0x24 0x10 0x1c # CHECK: min.d $f0, $f2, $f4 -0x46 0x24 0x10 0x1d # CHECK: max.d $f0, $f2, $f4 -0x46 0x24 0x10 0x1e # CHECK: mina.d $f0, $f2, $f4 +0x46 0x24 0x10 0x1d # CHECK: mina.d $f0, $f2, $f4 +0x46 0x24 0x10 0x1e # CHECK: max.d $f0, $f2, $f4 0x46 0x24 0x10 0x1f # CHECK: maxa.d $f0, $f2, $f4 0x46 0x24 0x18 0x98 # CHECK: 
maddf.d $f2, $f3, $f4 0x46 0x24 0x18 0x99 # CHECK: msubf.d $f2, $f3, $f4 diff --git a/llvm/test/MC/Mips/cpsetup.s b/llvm/test/MC/Mips/cpsetup.s index 8e587aea3e7e69..4a027c6e796aea 100644 --- a/llvm/test/MC/Mips/cpsetup.s +++ b/llvm/test/MC/Mips/cpsetup.s @@ -4,8 +4,6 @@ # RUN: llvm-mc -triple mips-unknown-linux -target-abi o32 %s | \ # RUN: FileCheck -check-prefixes=ASM,ASM-O32 %s -# FIXME: Now we check .cpsetup expansion for `-mno-shared` case only. -# We also need to implement/check the `-mshared` case. # RUN: llvm-mc -triple mips64-unknown-linux -target-abi n32 -filetype=obj -o - %s | \ # RUN: llvm-objdump --no-print-imm-hex -d -r -z - | \ # RUN: FileCheck -check-prefixes=ALL,NXX,N32 %s @@ -35,11 +33,16 @@ t1: # NXX-NEXT: sd $gp, 8($sp) # NXX-NEXT: lui $gp, 0 -# N32-NEXT: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # ASM-NEXT: .cpsetup $25, 8, __cerror @@ -64,11 +67,16 @@ t2: # NXX-NEXT: move $2, $gp # NXX-NEXT: lui $gp, 0 -# N32-NEXT: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # ASM-NEXT: .cpsetup $25, $2, __cerror @@ -101,11 +109,16 @@ t3: # NXX-NEXT: move $2, $gp # NXX-NEXT: lui $gp, 0 -# N32-NEXT: {{^ *0+}}38: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: {{^ 
*0+}}40: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 .text +# N32-NEXT: {{^ *0+}}40: R_MIPS_GPREL16 .text +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: {{^ *0+}}3c: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: {{^ *0+}}44: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 .text +# N32-NEXT: {{^ *0+}}44: R_MIPS_GPREL16 .text +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # NXX-NEXT: nop # NXX-NEXT: sub $3, $3, $2 @@ -158,11 +171,16 @@ t5: # NXX-NEXT: sd $gp, 8($sp) # NXX-NEXT: lui $gp, 0 -# N32-NEXT: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # ASM-NEXT: .cpsetup $25, 8, __cerror @@ -184,11 +202,16 @@ IMM_8 = 8 # NXX-NEXT: sd $gp, 8($sp) # NXX-NEXT: lui $gp, 0 -# N32-NEXT: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # ASM-NEXT: .cpsetup $25, 8, __cerror diff --git a/llvm/test/MC/Mips/forbidden-slot.s b/llvm/test/MC/Mips/forbidden-slot.s new file mode 100644 index 00000000000000..da98e70561695f --- /dev/null +++ b/llvm/test/MC/Mips/forbidden-slot.s @@ -0,0 +1,18 @@ +# RUN: llvm-mc -assemble -mcpu=mips64r6 -arch=mips64el -filetype=obj %s -o tmp.o +# RUN: llvm-objdump -d tmp.o | 
FileCheck %s --check-prefix=MIPSELR6 + +# MIPSELR6: 0000000000000000 : +# MIPSELR6-NEXT: beqzc $13, 0x0 +# MIPSELR6-NEXT: b 0x0 +# MIPSELR6: 0000000000000008 : +# MIPSELR6-NEXT: beqzc $13, 0x8 +# MIPSELR6-NEXT: nop +# MIPSELR6: b 0x8 + .set noreorder +aaa: + beqzc $t1, aaa + b aaa + .set reorder +bbb: + beqzc $t1, bbb + b bbb diff --git a/llvm/test/MC/Mips/macro-la-pic.s b/llvm/test/MC/Mips/macro-la-pic.s index 2303f34c35bcfe..1875952d80c4e7 100644 --- a/llvm/test/MC/Mips/macro-la-pic.s +++ b/llvm/test/MC/Mips/macro-la-pic.s @@ -255,3 +255,25 @@ la $25, 2f # XN32: lw $25, %got_disp(.Ltmp1)($gp) # encoding: [0x8f,0x99,A,A] # XN32: # fixup A - offset: 0, value: %got_disp(.Ltmp1), kind: fixup_Mips_GOT_DISP 2: + +la $2,.Lstr +# O32: lw $2, %got(.Lstr)($gp) # encoding: [0x8f,0x82,A,A] +# O32-NEXT: # fixup A - offset: 0, value: %got(.Lstr), kind: fixup_Mips_GOT +# O32-NEXT: addiu $2, $2, %lo(.Lstr) # encoding: [0x24,0x42,A,A] +# O32-NEXT: # fixup A - offset: 0, value: %lo(.Lstr), kind: fixup_Mips_LO16 + +# N32: lw $2, %got_disp(.Lstr)($gp) # encoding: [0x8f,0x82,A,A] +# N32-NEXT: # fixup A - offset: 0, value: %got_disp(.Lstr), kind: fixup_Mips_GOT_DISP + +la $2,$str2 +# O32: lw $2, %got($str2)($gp) # encoding: [0x8f,0x82,A,A] +# O32-NEXT: # fixup A - offset: 0, value: %got($str2), kind: fixup_Mips_GOT +# O32-NEXT: addiu $2, $2, %lo($str2) # encoding: [0x24,0x42,A,A] +# O32-NEXT: # fixup A - offset: 0, value: %lo($str2), kind: fixup_Mips_LO16 + +# N32: lw $2, %got_disp($str2)($gp) # encoding: [0x8f,0x82,A,A] +# N32-NEXT: # fixup A - offset: 0, value: %got_disp($str2), kind: fixup_Mips_GOT_DISP + +.rodata +.Lstr: .4byte 0 +$str2: .4byte 0 diff --git a/llvm/test/MC/Mips/mips32r6/relocations.s b/llvm/test/MC/Mips/mips32r6/relocations.s index dfd75e633bc2f7..8d4464bbbed77a 100644 --- a/llvm/test/MC/Mips/mips32r6/relocations.s +++ b/llvm/test/MC/Mips/mips32r6/relocations.s @@ -52,17 +52,17 @@ # CHECK-ELF: Relocations [ # CHECK-ELF: 0x0 R_MIPS_PC19_S2 bar # CHECK-ELF: 0x4 
R_MIPS_PC16 bar -# CHECK-ELF: 0x8 R_MIPS_PC16 bar -# CHECK-ELF: 0xC R_MIPS_PC21_S2 bar -# CHECK-ELF: 0x10 R_MIPS_PC21_S2 bar -# CHECK-ELF: 0x14 R_MIPS_PC26_S2 bar -# CHECK-ELF: 0x18 R_MIPS_PC26_S2 bar -# CHECK-ELF: 0x1C R_MIPS_PCHI16 bar -# CHECK-ELF: 0x20 R_MIPS_PCLO16 bar -# CHECK-ELF: 0x24 R_MIPS_PC19_S2 bar -# CHECK-ELF: 0x28 R_MIPS_PC19_S2 bar -# CHECK-ELF: 0x2C R_MIPS_LO16 bar -# CHECK-ELF: 0x30 R_MIPS_LO16 bar +# CHECK-ELF: 0xC R_MIPS_PC16 bar +# CHECK-ELF: 0x14 R_MIPS_PC21_S2 bar +# CHECK-ELF: 0x1C R_MIPS_PC21_S2 bar +# CHECK-ELF: 0x24 R_MIPS_PC26_S2 bar +# CHECK-ELF: 0x28 R_MIPS_PC26_S2 bar +# CHECK-ELF: 0x2C R_MIPS_PCHI16 bar +# CHECK-ELF: 0x30 R_MIPS_PCLO16 bar +# CHECK-ELF: 0x34 R_MIPS_PC19_S2 bar +# CHECK-ELF: 0x38 R_MIPS_PC19_S2 bar +# CHECK-ELF: 0x3C R_MIPS_LO16 bar +# CHECK-ELF: 0x40 R_MIPS_LO16 bar # CHECK-ELF: ] addiupc $2,bar diff --git a/llvm/test/MC/Mips/mips32r6/valid.s b/llvm/test/MC/Mips/mips32r6/valid.s index 0f098a176a67cc..0d705b6f242615 100644 --- a/llvm/test/MC/Mips/mips32r6/valid.s +++ b/llvm/test/MC/Mips/mips32r6/valid.s @@ -170,14 +170,14 @@ a: sel.s $f0,$f1,$f2 # CHECK: sel.s $f0, $f1, $f2 # encoding: [0x46,0x02,0x08,0x10] seleqz $2,$3,$4 # CHECK: seleqz $2, $3, $4 # encoding: [0x00,0x64,0x10,0x35] selnez $2,$3,$4 # CHECK: selnez $2, $3, $4 # encoding: [0x00,0x64,0x10,0x37] - max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d] - max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d] + max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e] + max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e] min.s $f0, $f2, $f4 # CHECK: min.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1c] min.d $f0, $f2, $f4 # CHECK: min.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1c] maxa.s $f0, $f2, $f4 # CHECK: maxa.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1f] maxa.d $f0, $f2, $f4 # CHECK: maxa.d $f0, $f2, $f4 # encoding: 
[0x46,0x24,0x10,0x1f] - mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e] - mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e] + mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d] + mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d] or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04] seleqz.s $f0, $f2, $f4 # CHECK: seleqz.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x14] seleqz.d $f0, $f2, $f4 # CHECK: seleqz.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x14] diff --git a/llvm/test/MC/Mips/mips64r6/relocations.s b/llvm/test/MC/Mips/mips64r6/relocations.s index 8353ec019a3ca8..8b02be37284aaa 100644 --- a/llvm/test/MC/Mips/mips64r6/relocations.s +++ b/llvm/test/MC/Mips/mips64r6/relocations.s @@ -59,19 +59,19 @@ # CHECK-ELF: Relocations [ # CHECK-ELF: 0x0 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 # CHECK-ELF: 0x4 R_MIPS_PC16/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x8 R_MIPS_PC16/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0xC R_MIPS_PC21_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x10 R_MIPS_PC21_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x14 R_MIPS_PC26_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x18 R_MIPS_PC26_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x1C R_MIPS_PCHI16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x20 R_MIPS_PCLO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x24 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x28 R_MIPS_PC18_S3/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x2C R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x30 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x34 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x38 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0xC 
R_MIPS_PC16/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x14 R_MIPS_PC21_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x1C R_MIPS_PC21_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x24 R_MIPS_PC26_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x28 R_MIPS_PC26_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x2C R_MIPS_PCHI16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x30 R_MIPS_PCLO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x34 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x38 R_MIPS_PC18_S3/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x3C R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x40 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x44 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x48 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 # CHECK-ELF: ] addiupc $2,bar diff --git a/llvm/test/MC/Mips/mips64r6/valid.s b/llvm/test/MC/Mips/mips64r6/valid.s index c50bd9e31c232e..ff6e1d73fbeb48 100644 --- a/llvm/test/MC/Mips/mips64r6/valid.s +++ b/llvm/test/MC/Mips/mips64r6/valid.s @@ -183,14 +183,14 @@ a: lwupc $2,268 # CHECK: lwupc $2, 268 # encoding: [0xec,0x50,0x00,0x43] maddf.d $f2,$f3,$f4 # CHECK: maddf.d $f2, $f3, $f4 # encoding: [0x46,0x24,0x18,0x98] maddf.s $f2,$f3,$f4 # CHECK: maddf.s $f2, $f3, $f4 # encoding: [0x46,0x04,0x18,0x98] - max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d] - max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d] + max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e] + max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e] maxa.d $f0, $f2, $f4 # CHECK: maxa.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1f] maxa.s $f0, $f2, $f4 # CHECK: maxa.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1f] min.d $f0, $f2, $f4 # CHECK: min.d $f0, $f2, $f4 # encoding: 
[0x46,0x24,0x10,0x1c] min.s $f0, $f2, $f4 # CHECK: min.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1c] - mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e] - mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e] + mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d] + mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d] mfc0 $8,$15,1 # CHECK: mfc0 $8, $15, 1 # encoding: [0x40,0x08,0x78,0x01] mod $2,$3,$4 # CHECK: mod $2, $3, $4 # encoding: [0x00,0x64,0x10,0xda] modu $2,$3,$4 # CHECK: modu $2, $3, $4 # encoding: [0x00,0x64,0x10,0xdb] diff --git a/llvm/test/MC/Mips/relocation.s b/llvm/test/MC/Mips/relocation.s index 9c8bb657ea68c1..a92c62744fcaa5 100644 --- a/llvm/test/MC/Mips/relocation.s +++ b/llvm/test/MC/Mips/relocation.s @@ -237,7 +237,7 @@ baz: .long foo // RELOC: R_MIPS_32 foo // ENCLE: addiu $2, $3, %tprel_lo(foo) # encoding: [A,A,0x62,0x24] // FIXUP: # fixup A - offset: 0, value: %tprel_lo(foo), kind: fixup_Mips_TPREL_LO -// DATA-NEXT: 00C0: D85FFFFF CBFFFFFF EC580000 EC480000 +// DATA-NEXT: 00C0: D85FFFFF 00000000 CBFFFFFF EC580000 // ?????: R_MIPS_GLOB_DAT foo .set mips32r6 beqzc $2, foo // RELOC: R_MIPS_PC21_S2 foo @@ -262,7 +262,7 @@ baz: .long foo // RELOC: R_MIPS_32 foo // ENCLE: lwpc $2, foo # encoding: [A,A,0b01001AAA,0xec] // FIXUP: # fixup A - offset: 0, value: foo, kind: fixup_MIPS_PC19_S2 -// DATA-NEXT: 00D0: 24620000 24620000 00000000 +// DATA-NEXT: 00D0: EC480000 24620000 24620000 00000000 addiu $2, $3, %pcrel_hi(foo) // RELOC: R_MIPS_PCHI16 foo // ENCBE: addiu $2, $3, %pcrel_hi(foo) # encoding: [0x24,0x62,A,A] // ENCLE: addiu $2, $3, %pcrel_hi(foo) # encoding: [A,A,0x62,0x24] diff --git a/llvm/test/MC/RISCV/rv32zicond-invalid.s b/llvm/test/MC/RISCV/rv32zicond-invalid.s index a350593993b525..02f5d1777b0ebc 100644 --- a/llvm/test/MC/RISCV/rv32zicond-invalid.s +++ b/llvm/test/MC/RISCV/rv32zicond-invalid.s @@ -1,5 +1,5 
@@ -# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zicond < %s 2>&1 | FileCheck %s -# RUN: not llvm-mc -triple riscv64 -mattr=+experimental-zicond < %s 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple riscv32 -mattr=+zicond < %s 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple riscv64 -mattr=+zicond < %s 2>&1 | FileCheck %s # Use of operand modifier on register name czero.eqz t1, %lo(t2), t3 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv32zicond-valid.s b/llvm/test/MC/RISCV/rv32zicond-valid.s index e6deb81301eca7..c862f04b806788 100644 --- a/llvm/test/MC/RISCV/rv32zicond-valid.s +++ b/llvm/test/MC/RISCV/rv32zicond-valid.s @@ -1,12 +1,12 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicond -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zicond -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicond -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zicond -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zicond < %s \ -# RUN: | llvm-objdump --mattr=+experimental-zicond -d -r - \ +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zicond < %s \ +# RUN: | llvm-objdump --mattr=+zicond -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zicond < %s \ -# RUN: | llvm-objdump --mattr=+experimental-zicond -d -r - \ +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zicond < %s \ +# RUN: | llvm-objdump --mattr=+zicond -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # CHECK-ASM-AND-OBJ: czero.eqz t0, a3, ra diff --git a/llvm/test/TableGen/MacroFusion.td b/llvm/test/TableGen/MacroFusion.td index 4aa6c8d9acb273..ce76e7f0f7fa64 100644 --- a/llvm/test/TableGen/MacroFusion.td +++ 
b/llvm/test/TableGen/MacroFusion.td @@ -34,6 +34,11 @@ let Namespace = "Test" in { def Inst0 : TestInst<0>; def Inst1 : TestInst<1>; +def BothFusionPredicate: BothFusionPredicateWithMCInstPredicate>; +def TestBothFusionPredicate: Fusion<"test-both-fusion-predicate", "HasBothFusionPredicate", + "Test BothFusionPredicate", + [BothFusionPredicate]>; + def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion", CheckOpcode<[Inst0]>, CheckAll<[ @@ -45,6 +50,7 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion", // CHECK-PREDICATOR-NEXT: #undef GET_Test_MACRO_FUSION_PRED_DECL // CHECK-PREDICATOR-EMPTY: // CHECK-PREDICATOR-NEXT: namespace llvm { +// CHECK-PREDICATOR-NEXT: bool isTestBothFusionPredicate(const TargetInstrInfo &, const TargetSubtargetInfo &, const MachineInstr *, const MachineInstr &); // CHECK-PREDICATOR-NEXT: bool isTestFusion(const TargetInstrInfo &, const TargetSubtargetInfo &, const MachineInstr *, const MachineInstr &); // CHECK-PREDICATOR-NEXT: } // end namespace llvm // CHECK-PREDICATOR-EMPTY: @@ -54,6 +60,24 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion", // CHECK-PREDICATOR-NEXT: #undef GET_Test_MACRO_FUSION_PRED_IMPL // CHECK-PREDICATOR-EMPTY: // CHECK-PREDICATOR-NEXT: namespace llvm { +// CHECK-PREDICATOR-NEXT: bool isTestBothFusionPredicate( +// CHECK-PREDICATOR-NEXT: const TargetInstrInfo &TII, +// CHECK-PREDICATOR-NEXT: const TargetSubtargetInfo &STI, +// CHECK-PREDICATOR-NEXT: const MachineInstr *FirstMI, +// CHECK-PREDICATOR-NEXT: const MachineInstr &SecondMI) { +// CHECK-PREDICATOR-NEXT: auto &MRI = SecondMI.getMF()->getRegInfo(); +// CHECK-PREDICATOR-NEXT: { +// CHECK-PREDICATOR-NEXT: const MachineInstr *MI = FirstMI; +// CHECK-PREDICATOR-NEXT: if (MI->getOperand(0).getReg() != Test::X0) +// CHECK-PREDICATOR-NEXT: return false; +// CHECK-PREDICATOR-NEXT: } +// CHECK-PREDICATOR-NEXT: { +// CHECK-PREDICATOR-NEXT: const MachineInstr *MI = &SecondMI; +// 
CHECK-PREDICATOR-NEXT: if (MI->getOperand(0).getReg() != Test::X0) +// CHECK-PREDICATOR-NEXT: return false; +// CHECK-PREDICATOR-NEXT: } +// CHECK-PREDICATOR-NEXT: return true; +// CHECK-PREDICATOR-NEXT: } // CHECK-PREDICATOR-NEXT: bool isTestFusion( // CHECK-PREDICATOR-NEXT: const TargetInstrInfo &TII, // CHECK-PREDICATOR-NEXT: const TargetSubtargetInfo &STI, @@ -106,6 +130,7 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion", // CHECK-SUBTARGET: std::vector TestGenSubtargetInfo::getMacroFusions() const { // CHECK-SUBTARGET-NEXT: std::vector Fusions; +// CHECK-SUBTARGET-NEXT: if (hasFeature(Test::TestBothFusionPredicate)) Fusions.push_back(llvm::isTestBothFusionPredicate); // CHECK-SUBTARGET-NEXT: if (hasFeature(Test::TestFusion)) Fusions.push_back(llvm::isTestFusion); // CHECK-SUBTARGET-NEXT: return Fusions; // CHECK-SUBTARGET-NEXT: } diff --git a/llvm/test/TableGen/address-space-patfrags.td b/llvm/test/TableGen/address-space-patfrags.td index 4aec6ea7e0eae8..46050a70720fbe 100644 --- a/llvm/test/TableGen/address-space-patfrags.td +++ b/llvm/test/TableGen/address-space-patfrags.td @@ -46,7 +46,7 @@ def inst_d : Instruction { let InOperandList = (ins GPR32:$src0, GPR32:$src1); } -// SDAG: case 1: { +// SDAG: case 0: { // SDAG-NEXT: // Predicate_pat_frag_b // SDAG-NEXT: // Predicate_truncstorei16_addrspace // SDAG-NEXT: SDNode *N = Node; @@ -69,7 +69,7 @@ def : Pat < >; -// SDAG: case 6: { +// SDAG: case 4: { // SDAG: // Predicate_pat_frag_a // SDAG-NEXT: SDNode *N = Node; // SDAG-NEXT: (void)N; diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index 9b12e4af00bf74..d6d96737695011 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -6608,15 +6608,15 @@ static const X86FoldTableEntry Table4[] = { }; static const X86FoldTableEntry BroadcastTable1[] = { - {X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rmb, TB_BCAST_SD}, - {X86::VCVTDQ2PDZ256rr, 
X86::VCVTDQ2PDZ256rmb, TB_BCAST_SD}, - {X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrmb, TB_BCAST_SD}, - {X86::VCVTDQ2PHZ128rr, X86::VCVTDQ2PHZ128rmb, TB_BCAST_SH}, - {X86::VCVTDQ2PHZ256rr, X86::VCVTDQ2PHZ256rmb, TB_BCAST_SH}, - {X86::VCVTDQ2PHZrr, X86::VCVTDQ2PHZrmb, TB_BCAST_SH}, - {X86::VCVTDQ2PSZ128rr, X86::VCVTDQ2PSZ128rmb, TB_BCAST_SS}, - {X86::VCVTDQ2PSZ256rr, X86::VCVTDQ2PSZ256rmb, TB_BCAST_SS}, - {X86::VCVTDQ2PSZrr, X86::VCVTDQ2PSZrmb, TB_BCAST_SS}, + {X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rmb, TB_BCAST_D}, + {X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rmb, TB_BCAST_D}, + {X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrmb, TB_BCAST_D}, + {X86::VCVTDQ2PHZ128rr, X86::VCVTDQ2PHZ128rmb, TB_BCAST_D}, + {X86::VCVTDQ2PHZ256rr, X86::VCVTDQ2PHZ256rmb, TB_BCAST_D}, + {X86::VCVTDQ2PHZrr, X86::VCVTDQ2PHZrmb, TB_BCAST_D}, + {X86::VCVTDQ2PSZ128rr, X86::VCVTDQ2PSZ128rmb, TB_BCAST_D}, + {X86::VCVTDQ2PSZ256rr, X86::VCVTDQ2PSZ256rmb, TB_BCAST_D}, + {X86::VCVTDQ2PSZrr, X86::VCVTDQ2PSZrmb, TB_BCAST_D}, {X86::VCVTNEPS2BF16Z128rr, X86::VCVTNEPS2BF16Z128rmb, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Z256rr, X86::VCVTNEPS2BF16Z256rmb, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Zrr, X86::VCVTNEPS2BF16Zrmb, TB_BCAST_SS}, @@ -6626,9 +6626,9 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VCVTPD2PHZ128rr, X86::VCVTPD2PHZ128rmb, TB_BCAST_SD}, {X86::VCVTPD2PHZ256rr, X86::VCVTPD2PHZ256rmb, TB_BCAST_SD}, {X86::VCVTPD2PHZrr, X86::VCVTPD2PHZrmb, TB_BCAST_SD}, - {X86::VCVTPD2PSZ128rr, X86::VCVTPD2PSZ128rmb, TB_BCAST_SS}, - {X86::VCVTPD2PSZ256rr, X86::VCVTPD2PSZ256rmb, TB_BCAST_SS}, - {X86::VCVTPD2PSZrr, X86::VCVTPD2PSZrmb, TB_BCAST_SS}, + {X86::VCVTPD2PSZ128rr, X86::VCVTPD2PSZ128rmb, TB_BCAST_SD}, + {X86::VCVTPD2PSZ256rr, X86::VCVTPD2PSZ256rmb, TB_BCAST_SD}, + {X86::VCVTPD2PSZrr, X86::VCVTPD2PSZrmb, TB_BCAST_SD}, {X86::VCVTPD2QQZ128rr, X86::VCVTPD2QQZ128rmb, TB_BCAST_SD}, {X86::VCVTPD2QQZ256rr, X86::VCVTPD2QQZ256rmb, TB_BCAST_SD}, {X86::VCVTPD2QQZrr, X86::VCVTPD2QQZrmb, TB_BCAST_SD}, @@ -6680,15 +6680,15 @@ static 
const X86FoldTableEntry BroadcastTable1[] = { {X86::VCVTPS2UQQZ128rr, X86::VCVTPS2UQQZ128rmb, TB_BCAST_SS}, {X86::VCVTPS2UQQZ256rr, X86::VCVTPS2UQQZ256rmb, TB_BCAST_SS}, {X86::VCVTPS2UQQZrr, X86::VCVTPS2UQQZrmb, TB_BCAST_SS}, - {X86::VCVTQQ2PDZ128rr, X86::VCVTQQ2PDZ128rmb, TB_BCAST_SD}, - {X86::VCVTQQ2PDZ256rr, X86::VCVTQQ2PDZ256rmb, TB_BCAST_SD}, - {X86::VCVTQQ2PDZrr, X86::VCVTQQ2PDZrmb, TB_BCAST_SD}, - {X86::VCVTQQ2PHZ128rr, X86::VCVTQQ2PHZ128rmb, TB_BCAST_SH}, - {X86::VCVTQQ2PHZ256rr, X86::VCVTQQ2PHZ256rmb, TB_BCAST_SH}, - {X86::VCVTQQ2PHZrr, X86::VCVTQQ2PHZrmb, TB_BCAST_SH}, - {X86::VCVTQQ2PSZ128rr, X86::VCVTQQ2PSZ128rmb, TB_BCAST_SS}, - {X86::VCVTQQ2PSZ256rr, X86::VCVTQQ2PSZ256rmb, TB_BCAST_SS}, - {X86::VCVTQQ2PSZrr, X86::VCVTQQ2PSZrmb, TB_BCAST_SS}, + {X86::VCVTQQ2PDZ128rr, X86::VCVTQQ2PDZ128rmb, TB_BCAST_Q}, + {X86::VCVTQQ2PDZ256rr, X86::VCVTQQ2PDZ256rmb, TB_BCAST_Q}, + {X86::VCVTQQ2PDZrr, X86::VCVTQQ2PDZrmb, TB_BCAST_Q}, + {X86::VCVTQQ2PHZ128rr, X86::VCVTQQ2PHZ128rmb, TB_BCAST_Q}, + {X86::VCVTQQ2PHZ256rr, X86::VCVTQQ2PHZ256rmb, TB_BCAST_Q}, + {X86::VCVTQQ2PHZrr, X86::VCVTQQ2PHZrmb, TB_BCAST_Q}, + {X86::VCVTQQ2PSZ128rr, X86::VCVTQQ2PSZ128rmb, TB_BCAST_Q}, + {X86::VCVTQQ2PSZ256rr, X86::VCVTQQ2PSZ256rmb, TB_BCAST_Q}, + {X86::VCVTQQ2PSZrr, X86::VCVTQQ2PSZrmb, TB_BCAST_Q}, {X86::VCVTTPD2DQZ128rr, X86::VCVTTPD2DQZ128rmb, TB_BCAST_SD}, {X86::VCVTTPD2DQZ256rr, X86::VCVTTPD2DQZ256rmb, TB_BCAST_SD}, {X86::VCVTTPD2DQZrr, X86::VCVTTPD2DQZrmb, TB_BCAST_SD}, @@ -6731,30 +6731,30 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VCVTTPS2UQQZ128rr, X86::VCVTTPS2UQQZ128rmb, TB_BCAST_SS}, {X86::VCVTTPS2UQQZ256rr, X86::VCVTTPS2UQQZ256rmb, TB_BCAST_SS}, {X86::VCVTTPS2UQQZrr, X86::VCVTTPS2UQQZrmb, TB_BCAST_SS}, - {X86::VCVTUDQ2PDZ128rr, X86::VCVTUDQ2PDZ128rmb, TB_BCAST_SD}, - {X86::VCVTUDQ2PDZ256rr, X86::VCVTUDQ2PDZ256rmb, TB_BCAST_SD}, - {X86::VCVTUDQ2PDZrr, X86::VCVTUDQ2PDZrmb, TB_BCAST_SD}, - {X86::VCVTUDQ2PHZ128rr, X86::VCVTUDQ2PHZ128rmb, TB_BCAST_SH}, - 
{X86::VCVTUDQ2PHZ256rr, X86::VCVTUDQ2PHZ256rmb, TB_BCAST_SH}, - {X86::VCVTUDQ2PHZrr, X86::VCVTUDQ2PHZrmb, TB_BCAST_SH}, - {X86::VCVTUDQ2PSZ128rr, X86::VCVTUDQ2PSZ128rmb, TB_BCAST_SS}, - {X86::VCVTUDQ2PSZ256rr, X86::VCVTUDQ2PSZ256rmb, TB_BCAST_SS}, - {X86::VCVTUDQ2PSZrr, X86::VCVTUDQ2PSZrmb, TB_BCAST_SS}, - {X86::VCVTUQQ2PDZ128rr, X86::VCVTUQQ2PDZ128rmb, TB_BCAST_SD}, - {X86::VCVTUQQ2PDZ256rr, X86::VCVTUQQ2PDZ256rmb, TB_BCAST_SD}, - {X86::VCVTUQQ2PDZrr, X86::VCVTUQQ2PDZrmb, TB_BCAST_SD}, - {X86::VCVTUQQ2PHZ128rr, X86::VCVTUQQ2PHZ128rmb, TB_BCAST_SH}, - {X86::VCVTUQQ2PHZ256rr, X86::VCVTUQQ2PHZ256rmb, TB_BCAST_SH}, - {X86::VCVTUQQ2PHZrr, X86::VCVTUQQ2PHZrmb, TB_BCAST_SH}, - {X86::VCVTUQQ2PSZ128rr, X86::VCVTUQQ2PSZ128rmb, TB_BCAST_SS}, - {X86::VCVTUQQ2PSZ256rr, X86::VCVTUQQ2PSZ256rmb, TB_BCAST_SS}, - {X86::VCVTUQQ2PSZrr, X86::VCVTUQQ2PSZrmb, TB_BCAST_SS}, - {X86::VCVTUW2PHZ128rr, X86::VCVTUW2PHZ128rmb, TB_BCAST_SH}, - {X86::VCVTUW2PHZ256rr, X86::VCVTUW2PHZ256rmb, TB_BCAST_SH}, - {X86::VCVTUW2PHZrr, X86::VCVTUW2PHZrmb, TB_BCAST_SH}, - {X86::VCVTW2PHZ128rr, X86::VCVTW2PHZ128rmb, TB_BCAST_SH}, - {X86::VCVTW2PHZ256rr, X86::VCVTW2PHZ256rmb, TB_BCAST_SH}, - {X86::VCVTW2PHZrr, X86::VCVTW2PHZrmb, TB_BCAST_SH}, + {X86::VCVTUDQ2PDZ128rr, X86::VCVTUDQ2PDZ128rmb, TB_BCAST_D}, + {X86::VCVTUDQ2PDZ256rr, X86::VCVTUDQ2PDZ256rmb, TB_BCAST_D}, + {X86::VCVTUDQ2PDZrr, X86::VCVTUDQ2PDZrmb, TB_BCAST_D}, + {X86::VCVTUDQ2PHZ128rr, X86::VCVTUDQ2PHZ128rmb, TB_BCAST_D}, + {X86::VCVTUDQ2PHZ256rr, X86::VCVTUDQ2PHZ256rmb, TB_BCAST_D}, + {X86::VCVTUDQ2PHZrr, X86::VCVTUDQ2PHZrmb, TB_BCAST_D}, + {X86::VCVTUDQ2PSZ128rr, X86::VCVTUDQ2PSZ128rmb, TB_BCAST_D}, + {X86::VCVTUDQ2PSZ256rr, X86::VCVTUDQ2PSZ256rmb, TB_BCAST_D}, + {X86::VCVTUDQ2PSZrr, X86::VCVTUDQ2PSZrmb, TB_BCAST_D}, + {X86::VCVTUQQ2PDZ128rr, X86::VCVTUQQ2PDZ128rmb, TB_BCAST_Q}, + {X86::VCVTUQQ2PDZ256rr, X86::VCVTUQQ2PDZ256rmb, TB_BCAST_Q}, + {X86::VCVTUQQ2PDZrr, X86::VCVTUQQ2PDZrmb, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZ128rr, X86::VCVTUQQ2PHZ128rmb, 
TB_BCAST_Q}, + {X86::VCVTUQQ2PHZ256rr, X86::VCVTUQQ2PHZ256rmb, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZrr, X86::VCVTUQQ2PHZrmb, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZ128rr, X86::VCVTUQQ2PSZ128rmb, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZ256rr, X86::VCVTUQQ2PSZ256rmb, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZrr, X86::VCVTUQQ2PSZrmb, TB_BCAST_Q}, + {X86::VCVTUW2PHZ128rr, X86::VCVTUW2PHZ128rmb, TB_BCAST_W}, + {X86::VCVTUW2PHZ256rr, X86::VCVTUW2PHZ256rmb, TB_BCAST_W}, + {X86::VCVTUW2PHZrr, X86::VCVTUW2PHZrmb, TB_BCAST_W}, + {X86::VCVTW2PHZ128rr, X86::VCVTW2PHZ128rmb, TB_BCAST_W}, + {X86::VCVTW2PHZ256rr, X86::VCVTW2PHZ256rmb, TB_BCAST_W}, + {X86::VCVTW2PHZrr, X86::VCVTW2PHZrmb, TB_BCAST_W}, {X86::VEXP2PDZr, X86::VEXP2PDZmb, TB_BCAST_SD}, {X86::VEXP2PSZr, X86::VEXP2PSZmb, TB_BCAST_SS}, {X86::VFPCLASSPDZ128rr, X86::VFPCLASSPDZ128rmb, TB_BCAST_SD}, @@ -6945,15 +6945,15 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCMPPSZ128rri, X86::VCMPPSZ128rmbi, TB_BCAST_SS}, {X86::VCMPPSZ256rri, X86::VCMPPSZ256rmbi, TB_BCAST_SS}, {X86::VCMPPSZrri, X86::VCMPPSZrmbi, TB_BCAST_SS}, - {X86::VCVTDQ2PDZ128rrkz, X86::VCVTDQ2PDZ128rmbkz, TB_BCAST_SD}, - {X86::VCVTDQ2PDZ256rrkz, X86::VCVTDQ2PDZ256rmbkz, TB_BCAST_SD}, - {X86::VCVTDQ2PDZrrkz, X86::VCVTDQ2PDZrmbkz, TB_BCAST_SD}, - {X86::VCVTDQ2PHZ128rrkz, X86::VCVTDQ2PHZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTDQ2PHZ256rrkz, X86::VCVTDQ2PHZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTDQ2PHZrrkz, X86::VCVTDQ2PHZrmbkz, TB_BCAST_SH}, - {X86::VCVTDQ2PSZ128rrkz, X86::VCVTDQ2PSZ128rmbkz, TB_BCAST_SS}, - {X86::VCVTDQ2PSZ256rrkz, X86::VCVTDQ2PSZ256rmbkz, TB_BCAST_SS}, - {X86::VCVTDQ2PSZrrkz, X86::VCVTDQ2PSZrmbkz, TB_BCAST_SS}, + {X86::VCVTDQ2PDZ128rrkz, X86::VCVTDQ2PDZ128rmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PDZ256rrkz, X86::VCVTDQ2PDZ256rmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PDZrrkz, X86::VCVTDQ2PDZrmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PHZ128rrkz, X86::VCVTDQ2PHZ128rmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PHZ256rrkz, X86::VCVTDQ2PHZ256rmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PHZrrkz, 
X86::VCVTDQ2PHZrmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PSZ128rrkz, X86::VCVTDQ2PSZ128rmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PSZ256rrkz, X86::VCVTDQ2PSZ256rmbkz, TB_BCAST_D}, + {X86::VCVTDQ2PSZrrkz, X86::VCVTDQ2PSZrmbkz, TB_BCAST_D}, {X86::VCVTNE2PS2BF16Z128rr, X86::VCVTNE2PS2BF16Z128rmb, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Z256rr, X86::VCVTNE2PS2BF16Z256rmb, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Zrr, X86::VCVTNE2PS2BF16Zrmb, TB_BCAST_SS}, @@ -6966,9 +6966,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCVTPD2PHZ128rrkz, X86::VCVTPD2PHZ128rmbkz, TB_BCAST_SD}, {X86::VCVTPD2PHZ256rrkz, X86::VCVTPD2PHZ256rmbkz, TB_BCAST_SD}, {X86::VCVTPD2PHZrrkz, X86::VCVTPD2PHZrmbkz, TB_BCAST_SD}, - {X86::VCVTPD2PSZ128rrkz, X86::VCVTPD2PSZ128rmbkz, TB_BCAST_SS}, - {X86::VCVTPD2PSZ256rrkz, X86::VCVTPD2PSZ256rmbkz, TB_BCAST_SS}, - {X86::VCVTPD2PSZrrkz, X86::VCVTPD2PSZrmbkz, TB_BCAST_SS}, + {X86::VCVTPD2PSZ128rrkz, X86::VCVTPD2PSZ128rmbkz, TB_BCAST_SD}, + {X86::VCVTPD2PSZ256rrkz, X86::VCVTPD2PSZ256rmbkz, TB_BCAST_SD}, + {X86::VCVTPD2PSZrrkz, X86::VCVTPD2PSZrmbkz, TB_BCAST_SD}, {X86::VCVTPD2QQZ128rrkz, X86::VCVTPD2QQZ128rmbkz, TB_BCAST_SD}, {X86::VCVTPD2QQZ256rrkz, X86::VCVTPD2QQZ256rmbkz, TB_BCAST_SD}, {X86::VCVTPD2QQZrrkz, X86::VCVTPD2QQZrmbkz, TB_BCAST_SD}, @@ -7020,15 +7020,15 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCVTPS2UQQZ128rrkz, X86::VCVTPS2UQQZ128rmbkz, TB_BCAST_SS}, {X86::VCVTPS2UQQZ256rrkz, X86::VCVTPS2UQQZ256rmbkz, TB_BCAST_SS}, {X86::VCVTPS2UQQZrrkz, X86::VCVTPS2UQQZrmbkz, TB_BCAST_SS}, - {X86::VCVTQQ2PDZ128rrkz, X86::VCVTQQ2PDZ128rmbkz, TB_BCAST_SD}, - {X86::VCVTQQ2PDZ256rrkz, X86::VCVTQQ2PDZ256rmbkz, TB_BCAST_SD}, - {X86::VCVTQQ2PDZrrkz, X86::VCVTQQ2PDZrmbkz, TB_BCAST_SD}, - {X86::VCVTQQ2PHZ128rrkz, X86::VCVTQQ2PHZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTQQ2PHZ256rrkz, X86::VCVTQQ2PHZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTQQ2PHZrrkz, X86::VCVTQQ2PHZrmbkz, TB_BCAST_SH}, - {X86::VCVTQQ2PSZ128rrkz, X86::VCVTQQ2PSZ128rmbkz, TB_BCAST_SS}, - 
{X86::VCVTQQ2PSZ256rrkz, X86::VCVTQQ2PSZ256rmbkz, TB_BCAST_SS}, - {X86::VCVTQQ2PSZrrkz, X86::VCVTQQ2PSZrmbkz, TB_BCAST_SS}, + {X86::VCVTQQ2PDZ128rrkz, X86::VCVTQQ2PDZ128rmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PDZ256rrkz, X86::VCVTQQ2PDZ256rmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PDZrrkz, X86::VCVTQQ2PDZrmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PHZ128rrkz, X86::VCVTQQ2PHZ128rmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PHZ256rrkz, X86::VCVTQQ2PHZ256rmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PHZrrkz, X86::VCVTQQ2PHZrmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PSZ128rrkz, X86::VCVTQQ2PSZ128rmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PSZ256rrkz, X86::VCVTQQ2PSZ256rmbkz, TB_BCAST_Q}, + {X86::VCVTQQ2PSZrrkz, X86::VCVTQQ2PSZrmbkz, TB_BCAST_Q}, {X86::VCVTTPD2DQZ128rrkz, X86::VCVTTPD2DQZ128rmbkz, TB_BCAST_SD}, {X86::VCVTTPD2DQZ256rrkz, X86::VCVTTPD2DQZ256rmbkz, TB_BCAST_SD}, {X86::VCVTTPD2DQZrrkz, X86::VCVTTPD2DQZrmbkz, TB_BCAST_SD}, @@ -7071,30 +7071,30 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCVTTPS2UQQZ128rrkz, X86::VCVTTPS2UQQZ128rmbkz, TB_BCAST_SS}, {X86::VCVTTPS2UQQZ256rrkz, X86::VCVTTPS2UQQZ256rmbkz, TB_BCAST_SS}, {X86::VCVTTPS2UQQZrrkz, X86::VCVTTPS2UQQZrmbkz, TB_BCAST_SS}, - {X86::VCVTUDQ2PDZ128rrkz, X86::VCVTUDQ2PDZ128rmbkz, TB_BCAST_SD}, - {X86::VCVTUDQ2PDZ256rrkz, X86::VCVTUDQ2PDZ256rmbkz, TB_BCAST_SD}, - {X86::VCVTUDQ2PDZrrkz, X86::VCVTUDQ2PDZrmbkz, TB_BCAST_SD}, - {X86::VCVTUDQ2PHZ128rrkz, X86::VCVTUDQ2PHZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTUDQ2PHZ256rrkz, X86::VCVTUDQ2PHZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTUDQ2PHZrrkz, X86::VCVTUDQ2PHZrmbkz, TB_BCAST_SH}, - {X86::VCVTUDQ2PSZ128rrkz, X86::VCVTUDQ2PSZ128rmbkz, TB_BCAST_SS}, - {X86::VCVTUDQ2PSZ256rrkz, X86::VCVTUDQ2PSZ256rmbkz, TB_BCAST_SS}, - {X86::VCVTUDQ2PSZrrkz, X86::VCVTUDQ2PSZrmbkz, TB_BCAST_SS}, - {X86::VCVTUQQ2PDZ128rrkz, X86::VCVTUQQ2PDZ128rmbkz, TB_BCAST_SD}, - {X86::VCVTUQQ2PDZ256rrkz, X86::VCVTUQQ2PDZ256rmbkz, TB_BCAST_SD}, - {X86::VCVTUQQ2PDZrrkz, X86::VCVTUQQ2PDZrmbkz, TB_BCAST_SD}, - {X86::VCVTUQQ2PHZ128rrkz, 
X86::VCVTUQQ2PHZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTUQQ2PHZ256rrkz, X86::VCVTUQQ2PHZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTUQQ2PHZrrkz, X86::VCVTUQQ2PHZrmbkz, TB_BCAST_SH}, - {X86::VCVTUQQ2PSZ128rrkz, X86::VCVTUQQ2PSZ128rmbkz, TB_BCAST_SS}, - {X86::VCVTUQQ2PSZ256rrkz, X86::VCVTUQQ2PSZ256rmbkz, TB_BCAST_SS}, - {X86::VCVTUQQ2PSZrrkz, X86::VCVTUQQ2PSZrmbkz, TB_BCAST_SS}, - {X86::VCVTUW2PHZ128rrkz, X86::VCVTUW2PHZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTUW2PHZ256rrkz, X86::VCVTUW2PHZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTUW2PHZrrkz, X86::VCVTUW2PHZrmbkz, TB_BCAST_SH}, - {X86::VCVTW2PHZ128rrkz, X86::VCVTW2PHZ128rmbkz, TB_BCAST_SH}, - {X86::VCVTW2PHZ256rrkz, X86::VCVTW2PHZ256rmbkz, TB_BCAST_SH}, - {X86::VCVTW2PHZrrkz, X86::VCVTW2PHZrmbkz, TB_BCAST_SH}, + {X86::VCVTUDQ2PDZ128rrkz, X86::VCVTUDQ2PDZ128rmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PDZ256rrkz, X86::VCVTUDQ2PDZ256rmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PDZrrkz, X86::VCVTUDQ2PDZrmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PHZ128rrkz, X86::VCVTUDQ2PHZ128rmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PHZ256rrkz, X86::VCVTUDQ2PHZ256rmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PHZrrkz, X86::VCVTUDQ2PHZrmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PSZ128rrkz, X86::VCVTUDQ2PSZ128rmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PSZ256rrkz, X86::VCVTUDQ2PSZ256rmbkz, TB_BCAST_D}, + {X86::VCVTUDQ2PSZrrkz, X86::VCVTUDQ2PSZrmbkz, TB_BCAST_D}, + {X86::VCVTUQQ2PDZ128rrkz, X86::VCVTUQQ2PDZ128rmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PDZ256rrkz, X86::VCVTUQQ2PDZ256rmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PDZrrkz, X86::VCVTUQQ2PDZrmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZ128rrkz, X86::VCVTUQQ2PHZ128rmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZ256rrkz, X86::VCVTUQQ2PHZ256rmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZrrkz, X86::VCVTUQQ2PHZrmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZ128rrkz, X86::VCVTUQQ2PSZ128rmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZ256rrkz, X86::VCVTUQQ2PSZ256rmbkz, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZrrkz, X86::VCVTUQQ2PSZrmbkz, TB_BCAST_Q}, + {X86::VCVTUW2PHZ128rrkz, X86::VCVTUW2PHZ128rmbkz, TB_BCAST_W}, + 
{X86::VCVTUW2PHZ256rrkz, X86::VCVTUW2PHZ256rmbkz, TB_BCAST_W}, + {X86::VCVTUW2PHZrrkz, X86::VCVTUW2PHZrmbkz, TB_BCAST_W}, + {X86::VCVTW2PHZ128rrkz, X86::VCVTW2PHZ128rmbkz, TB_BCAST_W}, + {X86::VCVTW2PHZ256rrkz, X86::VCVTW2PHZ256rmbkz, TB_BCAST_W}, + {X86::VCVTW2PHZrrkz, X86::VCVTW2PHZrmbkz, TB_BCAST_W}, {X86::VDIVPDZ128rr, X86::VDIVPDZ128rmb, TB_BCAST_SD}, {X86::VDIVPDZ256rr, X86::VDIVPDZ256rmb, TB_BCAST_SD}, {X86::VDIVPDZrr, X86::VDIVPDZrmb, TB_BCAST_SD}, @@ -7148,9 +7148,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rmb, TB_BCAST_SD}, {X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rmb, TB_BCAST_SD}, {X86::VMAXCPDZrr, X86::VMAXCPDZrmb, TB_BCAST_SD}, - {X86::VMAXCPHZ128rr, X86::VMAXCPHZ128rmb, TB_BCAST_SS}, - {X86::VMAXCPHZ256rr, X86::VMAXCPHZ256rmb, TB_BCAST_SS}, - {X86::VMAXCPHZrr, X86::VMAXCPHZrmb, TB_BCAST_SS}, + {X86::VMAXCPHZ128rr, X86::VMAXCPHZ128rmb, TB_BCAST_SH}, + {X86::VMAXCPHZ256rr, X86::VMAXCPHZ256rmb, TB_BCAST_SH}, + {X86::VMAXCPHZrr, X86::VMAXCPHZrmb, TB_BCAST_SH}, {X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rmb, TB_BCAST_SS}, {X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rmb, TB_BCAST_SS}, {X86::VMAXCPSZrr, X86::VMAXCPSZrmb, TB_BCAST_SS}, @@ -7166,9 +7166,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VMINCPDZ128rr, X86::VMINCPDZ128rmb, TB_BCAST_SD}, {X86::VMINCPDZ256rr, X86::VMINCPDZ256rmb, TB_BCAST_SD}, {X86::VMINCPDZrr, X86::VMINCPDZrmb, TB_BCAST_SD}, - {X86::VMINCPHZ128rr, X86::VMINCPHZ128rmb, TB_BCAST_SS}, - {X86::VMINCPHZ256rr, X86::VMINCPHZ256rmb, TB_BCAST_SS}, - {X86::VMINCPHZrr, X86::VMINCPHZrmb, TB_BCAST_SS}, + {X86::VMINCPHZ128rr, X86::VMINCPHZ128rmb, TB_BCAST_SH}, + {X86::VMINCPHZ256rr, X86::VMINCPHZ256rmb, TB_BCAST_SH}, + {X86::VMINCPHZrr, X86::VMINCPHZrmb, TB_BCAST_SH}, {X86::VMINCPSZ128rr, X86::VMINCPSZ128rmb, TB_BCAST_SS}, {X86::VMINCPSZ256rr, X86::VMINCPSZ256rmb, TB_BCAST_SS}, {X86::VMINCPSZrr, X86::VMINCPSZrmb, TB_BCAST_SS}, @@ -7442,15 +7442,15 @@ static const X86FoldTableEntry 
BroadcastTable2[] = { {X86::VPTESTNMQZ128rr, X86::VPTESTNMQZ128rmb, TB_BCAST_Q}, {X86::VPTESTNMQZ256rr, X86::VPTESTNMQZ256rmb, TB_BCAST_Q}, {X86::VPTESTNMQZrr, X86::VPTESTNMQZrmb, TB_BCAST_Q}, - {X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rmb, TB_BCAST_Q}, - {X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rmb, TB_BCAST_Q}, - {X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrmb, TB_BCAST_Q}, + {X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rmb, TB_BCAST_D}, + {X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rmb, TB_BCAST_D}, + {X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrmb, TB_BCAST_D}, {X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rmb, TB_BCAST_Q}, {X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rmb, TB_BCAST_Q}, {X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrmb, TB_BCAST_Q}, - {X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rmb, TB_BCAST_Q}, - {X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rmb, TB_BCAST_Q}, - {X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrmb, TB_BCAST_Q}, + {X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rmb, TB_BCAST_D}, + {X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rmb, TB_BCAST_D}, + {X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrmb, TB_BCAST_D}, {X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rmb, TB_BCAST_Q}, {X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rmb, TB_BCAST_Q}, {X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrmb, TB_BCAST_Q}, @@ -7610,15 +7610,15 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmbik, TB_BCAST_SS}, {X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmbik, TB_BCAST_SS}, {X86::VCMPPSZrrik, X86::VCMPPSZrmbik, TB_BCAST_SS}, - {X86::VCVTDQ2PDZ128rrk, X86::VCVTDQ2PDZ128rmbk, TB_BCAST_SD}, - {X86::VCVTDQ2PDZ256rrk, X86::VCVTDQ2PDZ256rmbk, TB_BCAST_SD}, - {X86::VCVTDQ2PDZrrk, X86::VCVTDQ2PDZrmbk, TB_BCAST_SD}, - {X86::VCVTDQ2PHZ128rrk, X86::VCVTDQ2PHZ128rmbk, TB_BCAST_SH}, - {X86::VCVTDQ2PHZ256rrk, X86::VCVTDQ2PHZ256rmbk, TB_BCAST_SH}, - {X86::VCVTDQ2PHZrrk, X86::VCVTDQ2PHZrmbk, TB_BCAST_SH}, - {X86::VCVTDQ2PSZ128rrk, X86::VCVTDQ2PSZ128rmbk, TB_BCAST_SS}, - {X86::VCVTDQ2PSZ256rrk, 
X86::VCVTDQ2PSZ256rmbk, TB_BCAST_SS}, - {X86::VCVTDQ2PSZrrk, X86::VCVTDQ2PSZrmbk, TB_BCAST_SS}, + {X86::VCVTDQ2PDZ128rrk, X86::VCVTDQ2PDZ128rmbk, TB_BCAST_D}, + {X86::VCVTDQ2PDZ256rrk, X86::VCVTDQ2PDZ256rmbk, TB_BCAST_D}, + {X86::VCVTDQ2PDZrrk, X86::VCVTDQ2PDZrmbk, TB_BCAST_D}, + {X86::VCVTDQ2PHZ128rrk, X86::VCVTDQ2PHZ128rmbk, TB_BCAST_D}, + {X86::VCVTDQ2PHZ256rrk, X86::VCVTDQ2PHZ256rmbk, TB_BCAST_D}, + {X86::VCVTDQ2PHZrrk, X86::VCVTDQ2PHZrmbk, TB_BCAST_D}, + {X86::VCVTDQ2PSZ128rrk, X86::VCVTDQ2PSZ128rmbk, TB_BCAST_D}, + {X86::VCVTDQ2PSZ256rrk, X86::VCVTDQ2PSZ256rmbk, TB_BCAST_D}, + {X86::VCVTDQ2PSZrrk, X86::VCVTDQ2PSZrmbk, TB_BCAST_D}, {X86::VCVTNE2PS2BF16Z128rrkz, X86::VCVTNE2PS2BF16Z128rmbkz, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Z256rrkz, X86::VCVTNE2PS2BF16Z256rmbkz, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Zrrkz, X86::VCVTNE2PS2BF16Zrmbkz, TB_BCAST_SS}, @@ -7631,9 +7631,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCVTPD2PHZ128rrk, X86::VCVTPD2PHZ128rmbk, TB_BCAST_SD}, {X86::VCVTPD2PHZ256rrk, X86::VCVTPD2PHZ256rmbk, TB_BCAST_SD}, {X86::VCVTPD2PHZrrk, X86::VCVTPD2PHZrmbk, TB_BCAST_SD}, - {X86::VCVTPD2PSZ128rrk, X86::VCVTPD2PSZ128rmbk, TB_BCAST_SS}, - {X86::VCVTPD2PSZ256rrk, X86::VCVTPD2PSZ256rmbk, TB_BCAST_SS}, - {X86::VCVTPD2PSZrrk, X86::VCVTPD2PSZrmbk, TB_BCAST_SS}, + {X86::VCVTPD2PSZ128rrk, X86::VCVTPD2PSZ128rmbk, TB_BCAST_SD}, + {X86::VCVTPD2PSZ256rrk, X86::VCVTPD2PSZ256rmbk, TB_BCAST_SD}, + {X86::VCVTPD2PSZrrk, X86::VCVTPD2PSZrmbk, TB_BCAST_SD}, {X86::VCVTPD2QQZ128rrk, X86::VCVTPD2QQZ128rmbk, TB_BCAST_SD}, {X86::VCVTPD2QQZ256rrk, X86::VCVTPD2QQZ256rmbk, TB_BCAST_SD}, {X86::VCVTPD2QQZrrk, X86::VCVTPD2QQZrmbk, TB_BCAST_SD}, @@ -7685,15 +7685,15 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCVTPS2UQQZ128rrk, X86::VCVTPS2UQQZ128rmbk, TB_BCAST_SS}, {X86::VCVTPS2UQQZ256rrk, X86::VCVTPS2UQQZ256rmbk, TB_BCAST_SS}, {X86::VCVTPS2UQQZrrk, X86::VCVTPS2UQQZrmbk, TB_BCAST_SS}, - {X86::VCVTQQ2PDZ128rrk, X86::VCVTQQ2PDZ128rmbk, TB_BCAST_SD}, - 
{X86::VCVTQQ2PDZ256rrk, X86::VCVTQQ2PDZ256rmbk, TB_BCAST_SD}, - {X86::VCVTQQ2PDZrrk, X86::VCVTQQ2PDZrmbk, TB_BCAST_SD}, - {X86::VCVTQQ2PHZ128rrk, X86::VCVTQQ2PHZ128rmbk, TB_BCAST_SH}, - {X86::VCVTQQ2PHZ256rrk, X86::VCVTQQ2PHZ256rmbk, TB_BCAST_SH}, - {X86::VCVTQQ2PHZrrk, X86::VCVTQQ2PHZrmbk, TB_BCAST_SH}, - {X86::VCVTQQ2PSZ128rrk, X86::VCVTQQ2PSZ128rmbk, TB_BCAST_SS}, - {X86::VCVTQQ2PSZ256rrk, X86::VCVTQQ2PSZ256rmbk, TB_BCAST_SS}, - {X86::VCVTQQ2PSZrrk, X86::VCVTQQ2PSZrmbk, TB_BCAST_SS}, + {X86::VCVTQQ2PDZ128rrk, X86::VCVTQQ2PDZ128rmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PDZ256rrk, X86::VCVTQQ2PDZ256rmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PDZrrk, X86::VCVTQQ2PDZrmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PHZ128rrk, X86::VCVTQQ2PHZ128rmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PHZ256rrk, X86::VCVTQQ2PHZ256rmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PHZrrk, X86::VCVTQQ2PHZrmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PSZ128rrk, X86::VCVTQQ2PSZ128rmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PSZ256rrk, X86::VCVTQQ2PSZ256rmbk, TB_BCAST_Q}, + {X86::VCVTQQ2PSZrrk, X86::VCVTQQ2PSZrmbk, TB_BCAST_Q}, {X86::VCVTTPD2DQZ128rrk, X86::VCVTTPD2DQZ128rmbk, TB_BCAST_SD}, {X86::VCVTTPD2DQZ256rrk, X86::VCVTTPD2DQZ256rmbk, TB_BCAST_SD}, {X86::VCVTTPD2DQZrrk, X86::VCVTTPD2DQZrmbk, TB_BCAST_SD}, @@ -7736,30 +7736,30 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCVTTPS2UQQZ128rrk, X86::VCVTTPS2UQQZ128rmbk, TB_BCAST_SS}, {X86::VCVTTPS2UQQZ256rrk, X86::VCVTTPS2UQQZ256rmbk, TB_BCAST_SS}, {X86::VCVTTPS2UQQZrrk, X86::VCVTTPS2UQQZrmbk, TB_BCAST_SS}, - {X86::VCVTUDQ2PDZ128rrk, X86::VCVTUDQ2PDZ128rmbk, TB_BCAST_SD}, - {X86::VCVTUDQ2PDZ256rrk, X86::VCVTUDQ2PDZ256rmbk, TB_BCAST_SD}, - {X86::VCVTUDQ2PDZrrk, X86::VCVTUDQ2PDZrmbk, TB_BCAST_SD}, - {X86::VCVTUDQ2PHZ128rrk, X86::VCVTUDQ2PHZ128rmbk, TB_BCAST_SH}, - {X86::VCVTUDQ2PHZ256rrk, X86::VCVTUDQ2PHZ256rmbk, TB_BCAST_SH}, - {X86::VCVTUDQ2PHZrrk, X86::VCVTUDQ2PHZrmbk, TB_BCAST_SH}, - {X86::VCVTUDQ2PSZ128rrk, X86::VCVTUDQ2PSZ128rmbk, TB_BCAST_SS}, - {X86::VCVTUDQ2PSZ256rrk, X86::VCVTUDQ2PSZ256rmbk, 
TB_BCAST_SS}, - {X86::VCVTUDQ2PSZrrk, X86::VCVTUDQ2PSZrmbk, TB_BCAST_SS}, - {X86::VCVTUQQ2PDZ128rrk, X86::VCVTUQQ2PDZ128rmbk, TB_BCAST_SD}, - {X86::VCVTUQQ2PDZ256rrk, X86::VCVTUQQ2PDZ256rmbk, TB_BCAST_SD}, - {X86::VCVTUQQ2PDZrrk, X86::VCVTUQQ2PDZrmbk, TB_BCAST_SD}, - {X86::VCVTUQQ2PHZ128rrk, X86::VCVTUQQ2PHZ128rmbk, TB_BCAST_SH}, - {X86::VCVTUQQ2PHZ256rrk, X86::VCVTUQQ2PHZ256rmbk, TB_BCAST_SH}, - {X86::VCVTUQQ2PHZrrk, X86::VCVTUQQ2PHZrmbk, TB_BCAST_SH}, - {X86::VCVTUQQ2PSZ128rrk, X86::VCVTUQQ2PSZ128rmbk, TB_BCAST_SS}, - {X86::VCVTUQQ2PSZ256rrk, X86::VCVTUQQ2PSZ256rmbk, TB_BCAST_SS}, - {X86::VCVTUQQ2PSZrrk, X86::VCVTUQQ2PSZrmbk, TB_BCAST_SS}, - {X86::VCVTUW2PHZ128rrk, X86::VCVTUW2PHZ128rmbk, TB_BCAST_SH}, - {X86::VCVTUW2PHZ256rrk, X86::VCVTUW2PHZ256rmbk, TB_BCAST_SH}, - {X86::VCVTUW2PHZrrk, X86::VCVTUW2PHZrmbk, TB_BCAST_SH}, - {X86::VCVTW2PHZ128rrk, X86::VCVTW2PHZ128rmbk, TB_BCAST_SH}, - {X86::VCVTW2PHZ256rrk, X86::VCVTW2PHZ256rmbk, TB_BCAST_SH}, - {X86::VCVTW2PHZrrk, X86::VCVTW2PHZrmbk, TB_BCAST_SH}, + {X86::VCVTUDQ2PDZ128rrk, X86::VCVTUDQ2PDZ128rmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PDZ256rrk, X86::VCVTUDQ2PDZ256rmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PDZrrk, X86::VCVTUDQ2PDZrmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PHZ128rrk, X86::VCVTUDQ2PHZ128rmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PHZ256rrk, X86::VCVTUDQ2PHZ256rmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PHZrrk, X86::VCVTUDQ2PHZrmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PSZ128rrk, X86::VCVTUDQ2PSZ128rmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PSZ256rrk, X86::VCVTUDQ2PSZ256rmbk, TB_BCAST_D}, + {X86::VCVTUDQ2PSZrrk, X86::VCVTUDQ2PSZrmbk, TB_BCAST_D}, + {X86::VCVTUQQ2PDZ128rrk, X86::VCVTUQQ2PDZ128rmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PDZ256rrk, X86::VCVTUQQ2PDZ256rmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PDZrrk, X86::VCVTUQQ2PDZrmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZ128rrk, X86::VCVTUQQ2PHZ128rmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZ256rrk, X86::VCVTUQQ2PHZ256rmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PHZrrk, X86::VCVTUQQ2PHZrmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZ128rrk, 
X86::VCVTUQQ2PSZ128rmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZ256rrk, X86::VCVTUQQ2PSZ256rmbk, TB_BCAST_Q}, + {X86::VCVTUQQ2PSZrrk, X86::VCVTUQQ2PSZrmbk, TB_BCAST_Q}, + {X86::VCVTUW2PHZ128rrk, X86::VCVTUW2PHZ128rmbk, TB_BCAST_W}, + {X86::VCVTUW2PHZ256rrk, X86::VCVTUW2PHZ256rmbk, TB_BCAST_W}, + {X86::VCVTUW2PHZrrk, X86::VCVTUW2PHZrmbk, TB_BCAST_W}, + {X86::VCVTW2PHZ128rrk, X86::VCVTW2PHZ128rmbk, TB_BCAST_W}, + {X86::VCVTW2PHZ256rrk, X86::VCVTW2PHZ256rmbk, TB_BCAST_W}, + {X86::VCVTW2PHZrrk, X86::VCVTW2PHZrmbk, TB_BCAST_W}, {X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmbkz, TB_BCAST_SD}, {X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmbkz, TB_BCAST_SD}, {X86::VDIVPDZrrkz, X86::VDIVPDZrmbkz, TB_BCAST_SD}, @@ -7981,9 +7981,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmbkz, TB_BCAST_SD}, {X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmbkz, TB_BCAST_SD}, {X86::VMAXCPDZrrkz, X86::VMAXCPDZrmbkz, TB_BCAST_SD}, - {X86::VMAXCPHZ128rrkz, X86::VMAXCPHZ128rmbkz, TB_BCAST_SS}, - {X86::VMAXCPHZ256rrkz, X86::VMAXCPHZ256rmbkz, TB_BCAST_SS}, - {X86::VMAXCPHZrrkz, X86::VMAXCPHZrmbkz, TB_BCAST_SS}, + {X86::VMAXCPHZ128rrkz, X86::VMAXCPHZ128rmbkz, TB_BCAST_SH}, + {X86::VMAXCPHZ256rrkz, X86::VMAXCPHZ256rmbkz, TB_BCAST_SH}, + {X86::VMAXCPHZrrkz, X86::VMAXCPHZrmbkz, TB_BCAST_SH}, {X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmbkz, TB_BCAST_SS}, {X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmbkz, TB_BCAST_SS}, {X86::VMAXCPSZrrkz, X86::VMAXCPSZrmbkz, TB_BCAST_SS}, @@ -7999,9 +7999,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmbkz, TB_BCAST_SD}, {X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmbkz, TB_BCAST_SD}, {X86::VMINCPDZrrkz, X86::VMINCPDZrmbkz, TB_BCAST_SD}, - {X86::VMINCPHZ128rrkz, X86::VMINCPHZ128rmbkz, TB_BCAST_SS}, - {X86::VMINCPHZ256rrkz, X86::VMINCPHZ256rmbkz, TB_BCAST_SS}, - {X86::VMINCPHZrrkz, X86::VMINCPHZrmbkz, TB_BCAST_SS}, + {X86::VMINCPHZ128rrkz, X86::VMINCPHZ128rmbkz, TB_BCAST_SH}, + {X86::VMINCPHZ256rrkz, 
X86::VMINCPHZ256rmbkz, TB_BCAST_SH}, + {X86::VMINCPHZrrkz, X86::VMINCPHZrmbkz, TB_BCAST_SH}, {X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmbkz, TB_BCAST_SS}, {X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmbkz, TB_BCAST_SS}, {X86::VMINCPSZrrkz, X86::VMINCPSZrmbkz, TB_BCAST_SS}, @@ -8095,15 +8095,15 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmbk, TB_BCAST_Q}, {X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmbk, TB_BCAST_Q}, {X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmbk, TB_BCAST_Q}, - {X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128mb, TB_BCAST_SD}, - {X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256mb, TB_BCAST_SD}, - {X86::VPDPBUSDSZr, X86::VPDPBUSDSZmb, TB_BCAST_SD}, + {X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128mb, TB_BCAST_D}, + {X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256mb, TB_BCAST_D}, + {X86::VPDPBUSDSZr, X86::VPDPBUSDSZmb, TB_BCAST_D}, {X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128mb, TB_BCAST_D}, {X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256mb, TB_BCAST_D}, {X86::VPDPBUSDZr, X86::VPDPBUSDZmb, TB_BCAST_D}, - {X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128mb, TB_BCAST_SD}, - {X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256mb, TB_BCAST_SD}, - {X86::VPDPWSSDSZr, X86::VPDPWSSDSZmb, TB_BCAST_SD}, + {X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128mb, TB_BCAST_D}, + {X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256mb, TB_BCAST_D}, + {X86::VPDPWSSDSZr, X86::VPDPWSSDSZmb, TB_BCAST_D}, {X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128mb, TB_BCAST_D}, {X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256mb, TB_BCAST_D}, {X86::VPDPWSSDZr, X86::VPDPWSSDZmb, TB_BCAST_D}, @@ -8329,15 +8329,15 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VPTESTNMQZ128rrk, X86::VPTESTNMQZ128rmbk, TB_BCAST_Q}, {X86::VPTESTNMQZ256rrk, X86::VPTESTNMQZ256rmbk, TB_BCAST_Q}, {X86::VPTESTNMQZrrk, X86::VPTESTNMQZrmbk, TB_BCAST_Q}, - {X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmbkz, TB_BCAST_Q}, - {X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmbkz, TB_BCAST_Q}, - {X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmbkz, TB_BCAST_Q}, + 
{X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmbkz, TB_BCAST_D}, + {X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmbkz, TB_BCAST_D}, + {X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmbkz, TB_BCAST_D}, {X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmbkz, TB_BCAST_Q}, {X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmbkz, TB_BCAST_Q}, {X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmbkz, TB_BCAST_Q}, - {X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmbkz, TB_BCAST_Q}, - {X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmbkz, TB_BCAST_Q}, - {X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmbkz, TB_BCAST_Q}, + {X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmbkz, TB_BCAST_D}, + {X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmbkz, TB_BCAST_D}, + {X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmbkz, TB_BCAST_D}, {X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmbkz, TB_BCAST_Q}, {X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmbkz, TB_BCAST_Q}, {X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmbkz, TB_BCAST_Q}, @@ -8863,9 +8863,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmbk, TB_BCAST_SD}, {X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmbk, TB_BCAST_SD}, {X86::VMAXCPDZrrk, X86::VMAXCPDZrmbk, TB_BCAST_SD}, - {X86::VMAXCPHZ128rrk, X86::VMAXCPHZ128rmbk, TB_BCAST_SS}, - {X86::VMAXCPHZ256rrk, X86::VMAXCPHZ256rmbk, TB_BCAST_SS}, - {X86::VMAXCPHZrrk, X86::VMAXCPHZrmbk, TB_BCAST_SS}, + {X86::VMAXCPHZ128rrk, X86::VMAXCPHZ128rmbk, TB_BCAST_SH}, + {X86::VMAXCPHZ256rrk, X86::VMAXCPHZ256rmbk, TB_BCAST_SH}, + {X86::VMAXCPHZrrk, X86::VMAXCPHZrmbk, TB_BCAST_SH}, {X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmbk, TB_BCAST_SS}, {X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmbk, TB_BCAST_SS}, {X86::VMAXCPSZrrk, X86::VMAXCPSZrmbk, TB_BCAST_SS}, @@ -8881,9 +8881,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmbk, TB_BCAST_SD}, {X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmbk, TB_BCAST_SD}, {X86::VMINCPDZrrk, X86::VMINCPDZrmbk, TB_BCAST_SD}, - {X86::VMINCPHZ128rrk, 
X86::VMINCPHZ128rmbk, TB_BCAST_SS}, - {X86::VMINCPHZ256rrk, X86::VMINCPHZ256rmbk, TB_BCAST_SS}, - {X86::VMINCPHZrrk, X86::VMINCPHZrmbk, TB_BCAST_SS}, + {X86::VMINCPHZ128rrk, X86::VMINCPHZ128rmbk, TB_BCAST_SH}, + {X86::VMINCPHZ256rrk, X86::VMINCPHZ256rmbk, TB_BCAST_SH}, + {X86::VMINCPHZrrk, X86::VMINCPHZrmbk, TB_BCAST_SH}, {X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmbk, TB_BCAST_SS}, {X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmbk, TB_BCAST_SS}, {X86::VMINCPSZrrk, X86::VMINCPSZrmbk, TB_BCAST_SS}, @@ -8935,24 +8935,24 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VPANDQZ128rrk, X86::VPANDQZ128rmbk, TB_BCAST_Q}, {X86::VPANDQZ256rrk, X86::VPANDQZ256rmbk, TB_BCAST_Q}, {X86::VPANDQZrrk, X86::VPANDQZrmbk, TB_BCAST_Q}, - {X86::VPDPBUSDSZ128rk, X86::VPDPBUSDSZ128mbk, TB_BCAST_SD}, - {X86::VPDPBUSDSZ128rkz, X86::VPDPBUSDSZ128mbkz, TB_BCAST_SD}, - {X86::VPDPBUSDSZ256rk, X86::VPDPBUSDSZ256mbk, TB_BCAST_SD}, - {X86::VPDPBUSDSZ256rkz, X86::VPDPBUSDSZ256mbkz, TB_BCAST_SD}, - {X86::VPDPBUSDSZrk, X86::VPDPBUSDSZmbk, TB_BCAST_SD}, - {X86::VPDPBUSDSZrkz, X86::VPDPBUSDSZmbkz, TB_BCAST_SD}, + {X86::VPDPBUSDSZ128rk, X86::VPDPBUSDSZ128mbk, TB_BCAST_D}, + {X86::VPDPBUSDSZ128rkz, X86::VPDPBUSDSZ128mbkz, TB_BCAST_D}, + {X86::VPDPBUSDSZ256rk, X86::VPDPBUSDSZ256mbk, TB_BCAST_D}, + {X86::VPDPBUSDSZ256rkz, X86::VPDPBUSDSZ256mbkz, TB_BCAST_D}, + {X86::VPDPBUSDSZrk, X86::VPDPBUSDSZmbk, TB_BCAST_D}, + {X86::VPDPBUSDSZrkz, X86::VPDPBUSDSZmbkz, TB_BCAST_D}, {X86::VPDPBUSDZ128rk, X86::VPDPBUSDZ128mbk, TB_BCAST_D}, {X86::VPDPBUSDZ128rkz, X86::VPDPBUSDZ128mbkz, TB_BCAST_D}, {X86::VPDPBUSDZ256rk, X86::VPDPBUSDZ256mbk, TB_BCAST_D}, {X86::VPDPBUSDZ256rkz, X86::VPDPBUSDZ256mbkz, TB_BCAST_D}, {X86::VPDPBUSDZrk, X86::VPDPBUSDZmbk, TB_BCAST_D}, {X86::VPDPBUSDZrkz, X86::VPDPBUSDZmbkz, TB_BCAST_D}, - {X86::VPDPWSSDSZ128rk, X86::VPDPWSSDSZ128mbk, TB_BCAST_SD}, - {X86::VPDPWSSDSZ128rkz, X86::VPDPWSSDSZ128mbkz, TB_BCAST_SD}, - {X86::VPDPWSSDSZ256rk, X86::VPDPWSSDSZ256mbk, TB_BCAST_SD}, - 
{X86::VPDPWSSDSZ256rkz, X86::VPDPWSSDSZ256mbkz, TB_BCAST_SD}, - {X86::VPDPWSSDSZrk, X86::VPDPWSSDSZmbk, TB_BCAST_SD}, - {X86::VPDPWSSDSZrkz, X86::VPDPWSSDSZmbkz, TB_BCAST_SD}, + {X86::VPDPWSSDSZ128rk, X86::VPDPWSSDSZ128mbk, TB_BCAST_D}, + {X86::VPDPWSSDSZ128rkz, X86::VPDPWSSDSZ128mbkz, TB_BCAST_D}, + {X86::VPDPWSSDSZ256rk, X86::VPDPWSSDSZ256mbk, TB_BCAST_D}, + {X86::VPDPWSSDSZ256rkz, X86::VPDPWSSDSZ256mbkz, TB_BCAST_D}, + {X86::VPDPWSSDSZrk, X86::VPDPWSSDSZmbk, TB_BCAST_D}, + {X86::VPDPWSSDSZrkz, X86::VPDPWSSDSZmbkz, TB_BCAST_D}, {X86::VPDPWSSDZ128rk, X86::VPDPWSSDZ128mbk, TB_BCAST_D}, {X86::VPDPWSSDZ128rkz, X86::VPDPWSSDZ128mbkz, TB_BCAST_D}, {X86::VPDPWSSDZ256rk, X86::VPDPWSSDZ256mbk, TB_BCAST_D}, @@ -9162,15 +9162,15 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VPTERNLOGQZ256rrikz, X86::VPTERNLOGQZ256rmbikz, TB_BCAST_Q}, {X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmbik, TB_BCAST_Q}, {X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmbikz, TB_BCAST_Q}, - {X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmbk, TB_BCAST_Q}, - {X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmbk, TB_BCAST_Q}, - {X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmbk, TB_BCAST_Q}, + {X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmbk, TB_BCAST_D}, + {X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmbk, TB_BCAST_D}, + {X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmbk, TB_BCAST_D}, {X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmbk, TB_BCAST_Q}, {X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmbk, TB_BCAST_Q}, {X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmbk, TB_BCAST_Q}, - {X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmbk, TB_BCAST_Q}, - {X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmbk, TB_BCAST_Q}, - {X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmbk, TB_BCAST_Q}, + {X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmbk, TB_BCAST_D}, + {X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmbk, TB_BCAST_D}, + {X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmbk, TB_BCAST_D}, {X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmbk, TB_BCAST_Q}, 
{X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmbk, TB_BCAST_Q}, {X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmbk, TB_BCAST_Q}, diff --git a/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll b/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll new file mode 100644 index 00000000000000..1e1669b29b0db6 --- /dev/null +++ b/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -p argpromotion -S %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" + +@f = dso_local global { i16, i64 } { i16 1, i64 0 }, align 8 + +; Test case for https://github.com/llvm/llvm-project/issues/84807. + +; Make sure the loads from @callee are not moved to @caller, as the store +; in %then may aliases to load from %q. + +define i32 @caller1(i1 %c) { +; CHECK-LABEL: define i32 @caller1( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @callee1(ptr noundef nonnull @f, i1 [[C]]) +; CHECK-NEXT: ret i32 0 +; +entry: + call void @callee1(ptr noundef nonnull @f, i1 %c) + ret i32 0 +} + +define internal void @callee1(ptr nocapture noundef readonly %q, i1 %c) { +; CHECK-LABEL: define internal void @callee1( +; CHECK-SAME: ptr nocapture noundef readonly [[Q:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[EXIT:%.*]] +; CHECK: then: +; CHECK-NEXT: store i16 123, ptr @f, align 8 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[Q_0_VAL:%.*]] = load i16, ptr [[Q]], align 8 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 8 +; CHECK-NEXT: [[Q_8_VAL:%.*]] = load i64, ptr [[GEP_8]], align 8 +; CHECK-NEXT: call void @use(i16 [[Q_0_VAL]], i64 [[Q_8_VAL]]) +; CHECK-NEXT: ret void +; +entry: + br i1 
%c, label %then, label %exit + +then: + store i16 123, ptr @f, align 8 + br label %exit + +exit: + %l.0 = load i16, ptr %q, align 8 + %gep.8 = getelementptr inbounds i8, ptr %q, i64 8 + %l.1 = load i64, ptr %gep.8, align 8 + call void @use(i16 %l.0, i64 %l.1) + ret void + + uselistorder ptr %q, { 1, 0 } +} + +; Same as @caller1/callee2, but with default uselist order. +define i32 @caller2(i1 %c) { +; CHECK-LABEL: define i32 @caller2( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @callee2(ptr noundef nonnull @f, i1 [[C]]) +; CHECK-NEXT: ret i32 0 +; +entry: + call void @callee2(ptr noundef nonnull @f, i1 %c) + ret i32 0 +} + +define internal void @callee2(ptr nocapture noundef readonly %q, i1 %c) { +; CHECK-LABEL: define internal void @callee2( +; CHECK-SAME: ptr nocapture noundef readonly [[Q:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[EXIT:%.*]] +; CHECK: then: +; CHECK-NEXT: store i16 123, ptr @f, align 8 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[Q_0_VAL:%.*]] = load i16, ptr [[Q]], align 8 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 8 +; CHECK-NEXT: [[Q_8_VAL:%.*]] = load i64, ptr [[GEP_8]], align 8 +; CHECK-NEXT: call void @use(i16 [[Q_0_VAL]], i64 [[Q_8_VAL]]) +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %then, label %exit + +then: + store i16 123, ptr @f, align 8 + br label %exit + +exit: + %l.0 = load i16, ptr %q, align 8 + %gep.8 = getelementptr inbounds i8, ptr %q, i64 8 + %l.1 = load i64, ptr %gep.8, align 8 + call void @use(i16 %l.0, i64 %l.1) + ret void +} + +declare void @use(i16, i64) diff --git a/llvm/test/Transforms/Attributor/nofpclass-fpext.ll b/llvm/test/Transforms/Attributor/nofpclass-fpext.ll index 0ba114117ceec6..ee36f949529d4f 100644 --- a/llvm/test/Transforms/Attributor/nofpclass-fpext.ll +++ b/llvm/test/Transforms/Attributor/nofpclass-fpext.ll @@ -142,7 +142,7 @@ define double 
@ret_fpext_f32_to_f64_nosub(float nofpclass(sub) %arg0) { } define double @ret_fpext_f32_to_f64_nonorm(float nofpclass(norm) %arg0) { -; CHECK-LABEL: define nofpclass(sub norm) double @ret_fpext_f32_to_f64_nonorm +; CHECK-LABEL: define nofpclass(sub) double @ret_fpext_f32_to_f64_nonorm ; CHECK-SAME: (float nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[EXT:%.*]] = fpext float [[ARG0]] to double ; CHECK-NEXT: ret double [[EXT]] @@ -482,7 +482,37 @@ define double @ret_fpext_bf16_f64_nosub(bfloat nofpclass(sub) %arg0) { } define double @ret_fpext_bf16_f64_nonorm(bfloat nofpclass(norm) %arg0) { -; CHECK-LABEL: define nofpclass(sub norm) double @ret_fpext_bf16_f64_nonorm +; CHECK-LABEL: define nofpclass(sub) double @ret_fpext_bf16_f64_nonorm +; CHECK-SAME: (bfloat nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[EXT:%.*]] = fpext bfloat [[ARG0]] to double +; CHECK-NEXT: ret double [[EXT]] +; + %ext = fpext bfloat %arg0 to double + ret double %ext +} + +define double @ret_fpext_bf16_f64_nonorm_psub(bfloat nofpclass(norm psub) %arg0) { +; CHECK-LABEL: define nofpclass(sub pnorm) double @ret_fpext_bf16_f64_nonorm_psub +; CHECK-SAME: (bfloat nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[EXT:%.*]] = fpext bfloat [[ARG0]] to double +; CHECK-NEXT: ret double [[EXT]] +; + %ext = fpext bfloat %arg0 to double + ret double %ext +} + +define double @ret_fpext_bf16_f64_nonorm_nsub(bfloat nofpclass(norm nsub) %arg0) { +; CHECK-LABEL: define nofpclass(sub nnorm) double @ret_fpext_bf16_f64_nonorm_nsub +; CHECK-SAME: (bfloat nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[EXT:%.*]] = fpext bfloat [[ARG0]] to double +; CHECK-NEXT: ret double [[EXT]] +; + %ext = fpext bfloat %arg0 to double + ret double %ext +} + +define double @ret_fpext_bf16_f64_nonorm_sub(bfloat nofpclass(norm sub) %arg0) { +; CHECK-LABEL: define nofpclass(sub norm) double @ret_fpext_bf16_f64_nonorm_sub ; CHECK-SAME: (bfloat nofpclass(sub norm) 
[[ARG0:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[EXT:%.*]] = fpext bfloat [[ARG0]] to double ; CHECK-NEXT: ret double [[EXT]] diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll index ff5cef7e781fe6..25dfb3c53a077b 100644 --- a/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll +++ b/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll @@ -211,6 +211,29 @@ else: ret i32 %l } +define i32 @sub10_else_drop_nuw(i32 %a) { +; CHECK-LABEL: @sub10_else_drop_nuw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L:%.*]] = sub i32 [[A:%.*]], 10 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[L]], 0 +; CHECK-NEXT: br i1 [[TMP0]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: ret i32 0 +; CHECK: else: +; CHECK-NEXT: ret i32 [[L]] +; +entry: + %c = icmp eq i32 %a, 10 + br i1 %c, label %then, label %else + +then: + ret i32 0 + +else: + %l = sub nuw i32 %a, 10 + ret i32 %l +} + define i32 @subm10_then(i32 %a) { ; CHECK-LABEL: @subm10_then( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll new file mode 100644 index 00000000000000..a6909d14913494 --- /dev/null +++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -codegenprepare -S -mtriple=riscv64 < %s | FileCheck %s + +define i8 @hoist_add(i8 %x) { +; CHECK-LABEL: define i8 @hoist_add( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INC:%.*]] = add i8 [[X]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i8 [[INC]], 0 +; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i8 [[RETVAL]] +; 
+entry: + %cmp = icmp eq i8 %x, -1 + br i1 %cmp, label %exit, label %if.then + +if.then: + %inc = add nuw nsw i8 %x, 1 + br label %exit + +exit: + %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ] + ret i8 %retval +} + +define i8 @hoist_lshr(i8 %x) { +; CHECK-LABEL: define i8 @hoist_lshr( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INC:%.*]] = lshr i8 [[X]], 3 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i8 [[INC]], 0 +; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i8 [[RETVAL]] +; +entry: + %cmp = icmp ult i8 %x, 8 + br i1 %cmp, label %exit, label %if.then + +if.then: + %inc = lshr exact i8 %x, 3 + br label %exit + +exit: + %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ] + ret i8 %retval +} + +define i8 @nomove_add(i8 %x) { +; CHECK-LABEL: define i8 @nomove_add( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INC:%.*]] = add i8 [[X]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i8 [[INC]], 0 +; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i8 [[RETVAL]] +; +entry: + %inc = add nuw nsw i8 %x, 1 + %cmp = icmp eq i8 %x, -1 + br i1 %cmp, label %exit, label %if.then + +if.then: + br label %exit + +exit: + %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ] + ret i8 %retval +} diff --git a/llvm/test/Transforms/ConstraintElimination/minmax.ll b/llvm/test/Transforms/ConstraintElimination/minmax.ll index a31cf6845ad67d..82b932f14c4ffa 100644 --- a/llvm/test/Transforms/ConstraintElimination/minmax.ll +++ b/llvm/test/Transforms/ConstraintElimination/minmax.ll @@ -306,7 +306,9 @@ define i1 @smin_branchless(i32 %x, i32 %y) { ; CHECK-SAME: (i32 
[[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.smin.i32(i32 [[X]], i32 [[Y]]) -; CHECK-NEXT: [[RET:%.*]] = xor i1 true, false +; CHECK-NEXT: [[CMP1:%.*]] = icmp sle i32 [[MIN]], [[X]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[MIN]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = xor i1 [[CMP1]], [[CMP2]] ; CHECK-NEXT: ret i1 [[RET]] ; entry: diff --git a/llvm/test/Transforms/ConstraintElimination/umin-result-may-be-poison.ll b/llvm/test/Transforms/ConstraintElimination/umin-result-may-be-poison.ll new file mode 100644 index 00000000000000..6d1d95ec4fdba7 --- /dev/null +++ b/llvm/test/Transforms/ConstraintElimination/umin-result-may-be-poison.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -p constraint-elimination -S %s | FileCheck %s + +; Tests for https://github.com/llvm/llvm-project/issues/78621. + +define i1 @umin_not_used(i32 %arg) { +; CHECK-LABEL: define i1 @umin_not_used( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: [[ICMP:%.*]] = icmp slt i32 [[ARG]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.umin.i32(i32 [[SHL]], i32 80) +; CHECK-NEXT: [[CMP2:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: ret i1 [[ICMP]] +; + %icmp = icmp slt i32 %arg, 0 + %shl = shl nuw nsw i32 %arg, 3 + call i32 @llvm.umin.i32(i32 %shl, i32 80) + %cmp2 = shl nuw nsw i32 %arg, 3 + ret i1 %icmp +} + +define i1 @umin_poison_is_UB_via_call(i32 %arg) { +; CHECK-LABEL: define i1 @umin_poison_is_UB_via_call( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SHL]], i32 80) +; CHECK-NEXT: call void @noundef(i32 noundef [[MIN]]) +; CHECK-NEXT: [[CMP2:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: ret i1 false +; + %icmp = icmp slt i32 %arg, 0 + %shl = shl nuw nsw i32 %arg, 3 + %min = call i32 @llvm.umin.i32(i32 
%shl, i32 80) + call void @noundef(i32 noundef %min) + %cmp2 = shl nuw nsw i32 %arg, 3 + ret i1 %icmp +} + +define i1 @umin_poison_call_before_UB(i32 %arg) { +; CHECK-LABEL: define i1 @umin_poison_call_before_UB( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: [[ICMP:%.*]] = icmp slt i32 [[ARG]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SHL]], i32 80) +; CHECK-NEXT: call void @fn() +; CHECK-NEXT: call void @noundef(i32 noundef [[MIN]]) +; CHECK-NEXT: [[CMP2:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: ret i1 [[ICMP]] +; + %icmp = icmp slt i32 %arg, 0 + %shl = shl nuw nsw i32 %arg, 3 + %min = call i32 @llvm.umin.i32(i32 %shl, i32 80) + call void @fn() + call void @noundef(i32 noundef %min) + %cmp2 = shl nuw nsw i32 %arg, 3 + ret i1 %icmp +} + +declare i32 @llvm.umin.i32(i32, i32) #0 + +declare void @noundef(i32 noundef) +declare void @fn() diff --git a/llvm/test/Transforms/DeadStoreElimination/batchaa-caching-new-pointers.ll b/llvm/test/Transforms/DeadStoreElimination/batchaa-caching-new-pointers.ll new file mode 100644 index 00000000000000..ee9bd6912e2ae4 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/batchaa-caching-new-pointers.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=dse < %s | FileCheck %s +; +; DSE kills `store i32 44, ptr %struct.byte.4, align 4` but should not kill +; `call void @llvm.memset.p0.i64(...)` because it has a clobber read: +; `%ret = load ptr, ptr %struct.byte.8` + + +%struct.type = type { ptr, ptr } + +define ptr @foo(ptr noundef %ptr) { +; CHECK-LABEL: define ptr @foo( +; CHECK-SAME: ptr noundef [[PTR:%.*]]) { +; CHECK-NEXT: [[STRUCT_ALLOCA:%.*]] = alloca [[STRUCT_TYPE:%.*]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: [[STRUCT_BYTE_8:%.*]] = getelementptr inbounds 
i8, ptr [[STRUCT_ALLOCA]], i64 8 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_BYTE_8]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 [[TMP1]], i8 42, i64 4, i1 false) +; CHECK-NEXT: store i32 43, ptr [[STRUCT_BYTE_8]], align 4 +; CHECK-NEXT: [[RET:%.*]] = load ptr, ptr [[STRUCT_BYTE_8]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]] +; CHECK-NEXT: ret ptr [[RET]] +; + %struct.alloca = alloca %struct.type, align 8 + call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %struct.alloca) nounwind + %struct.byte.8 = getelementptr inbounds i8, ptr %struct.alloca, i64 8 + ; Set %struct.alloca[8, 16) to 42. + call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %struct.byte.8, i8 42, i64 8, i1 false) + ; Set %struct.alloca[8, 12) to 43. + store i32 43, ptr %struct.byte.8, align 4 + ; Set %struct.alloca[4, 8) to 44. + %struct.byte.4 = getelementptr inbounds i8, ptr %struct.alloca, i64 4 + store i32 44, ptr %struct.byte.4, align 4 + ; Return %struct.alloca[8, 16). + %ret = load ptr, ptr %struct.byte.8 + call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %struct.alloca) nounwind + ret ptr %ret +} + +; Set of tests based on @foo, but where the memset's operands cannot be erased +; due to other uses. Instead, they contain a number of removable MemoryDefs; +; with non-void types result types. 
+ +define ptr @foo_with_removable_malloc() { +; CHECK-LABEL: define ptr @foo_with_removable_malloc() { +; CHECK-NEXT: [[STRUCT_ALLOCA:%.*]] = alloca [[STRUCT_TYPE:%.*]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]] +; CHECK-NEXT: [[STRUCT_BYTE_4:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 4 +; CHECK-NEXT: [[STRUCT_BYTE_8:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 8 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_BYTE_8]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 [[TMP1]], i8 42, i64 4, i1 false) +; CHECK-NEXT: store i32 43, ptr [[STRUCT_BYTE_8]], align 4 +; CHECK-NEXT: [[RET:%.*]] = load ptr, ptr [[STRUCT_BYTE_8]], align 8 +; CHECK-NEXT: call void @readnone(ptr [[STRUCT_BYTE_4]]) +; CHECK-NEXT: call void @readnone(ptr [[STRUCT_BYTE_8]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]] +; CHECK-NEXT: ret ptr [[RET]] +; + %struct.alloca = alloca %struct.type, align 8 + call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %struct.alloca) nounwind + %struct.byte.4 = getelementptr inbounds i8, ptr %struct.alloca, i64 4 + %struct.byte.8 = getelementptr inbounds i8, ptr %struct.alloca, i64 8 + + ; Set of removable memory deffs + %m2 = tail call ptr @malloc(i64 4) + %m1 = tail call ptr @malloc(i64 4) + store i32 0, ptr %struct.byte.8 + store i32 0, ptr %struct.byte.8 + store i32 123, ptr %m1 + store i32 123, ptr %m2 + + ; Set %struct.alloca[8, 16) to 42. + call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %struct.byte.8, i8 42, i64 8, i1 false) + ; Set %struct.alloca[8, 12) to 43. + store i32 43, ptr %struct.byte.8, align 4 + ; Set %struct.alloca[4, 8) to 44. + store i32 44, ptr %struct.byte.4, align 4 + ; Return %struct.alloca[8, 16). 
+ %ret = load ptr, ptr %struct.byte.8 + call void @readnone(ptr %struct.byte.4); + call void @readnone(ptr %struct.byte.8); + call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %struct.alloca) nounwind + ret ptr %ret +} + +define ptr @foo_with_removable_malloc_free() { +; CHECK-LABEL: define ptr @foo_with_removable_malloc_free() { +; CHECK-NEXT: [[STRUCT_ALLOCA:%.*]] = alloca [[STRUCT_TYPE:%.*]], align 8 +; CHECK-NEXT: [[M1:%.*]] = tail call ptr @malloc(i64 4) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]] +; CHECK-NEXT: [[STRUCT_BYTE_4:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 4 +; CHECK-NEXT: [[STRUCT_BYTE_8:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 8 +; CHECK-NEXT: [[M2:%.*]] = tail call ptr @malloc(i64 4) +; CHECK-NEXT: call void @free(ptr [[M1]]) +; CHECK-NEXT: call void @free(ptr [[M2]]) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_BYTE_8]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 [[TMP1]], i8 42, i64 4, i1 false) +; CHECK-NEXT: store i32 43, ptr [[STRUCT_BYTE_8]], align 4 +; CHECK-NEXT: [[RET:%.*]] = load ptr, ptr [[STRUCT_BYTE_8]], align 8 +; CHECK-NEXT: call void @readnone(ptr [[STRUCT_BYTE_4]]) +; CHECK-NEXT: call void @readnone(ptr [[STRUCT_BYTE_8]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]] +; CHECK-NEXT: ret ptr [[RET]] +; + %struct.alloca = alloca %struct.type, align 8 + %m1 = tail call ptr @malloc(i64 4) + call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %struct.alloca) nounwind + %struct.byte.4 = getelementptr inbounds i8, ptr %struct.alloca, i64 4 + %struct.byte.8 = getelementptr inbounds i8, ptr %struct.alloca, i64 8 + + store i32 0, ptr %struct.byte.4 + store i32 0, ptr %struct.byte.8 + %m2 = tail call ptr @malloc(i64 4) + store i32 123, ptr %m1 + call void @free(ptr %m1); + store i32 123, ptr %m2 + call void @free(ptr %m2); + + ; 
Set %struct.alloca[8, 16) to 42. + call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %struct.byte.8, i8 42, i64 8, i1 false) + ; Set %struct.alloca[8, 12) to 43. + store i32 43, ptr %struct.byte.8, align 4 + ; Set %struct.alloca[4, 8) to 44. + store i32 44, ptr %struct.byte.4, align 4 + ; Return %struct.alloca[8, 16). + %ret = load ptr, ptr %struct.byte.8 + call void @readnone(ptr %struct.byte.4); + call void @readnone(ptr %struct.byte.8); + call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %struct.alloca) nounwind + ret ptr %ret +} + +define ptr @foo_with_malloc_to_calloc() { +; CHECK-LABEL: define ptr @foo_with_malloc_to_calloc() { +; CHECK-NEXT: [[STRUCT_ALLOCA:%.*]] = alloca [[STRUCT_TYPE:%.*]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]] +; CHECK-NEXT: [[STRUCT_BYTE_8:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 8 +; CHECK-NEXT: [[STRUCT_BYTE_4:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 4 +; CHECK-NEXT: [[CALLOC1:%.*]] = call ptr @calloc(i64 1, i64 4) +; CHECK-NEXT: [[CALLOC:%.*]] = call ptr @calloc(i64 1, i64 4) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_BYTE_8]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 [[TMP1]], i8 42, i64 4, i1 false) +; CHECK-NEXT: store i32 43, ptr [[STRUCT_BYTE_8]], align 4 +; CHECK-NEXT: [[RET:%.*]] = load ptr, ptr [[STRUCT_BYTE_8]], align 8 +; CHECK-NEXT: call void @readnone(ptr [[STRUCT_BYTE_4]]) +; CHECK-NEXT: call void @readnone(ptr [[STRUCT_BYTE_8]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]] +; CHECK-NEXT: call void @use(ptr [[CALLOC1]]) +; CHECK-NEXT: call void @use(ptr [[CALLOC]]) +; CHECK-NEXT: ret ptr [[RET]] +; + %struct.alloca = alloca %struct.type, align 8 + call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %struct.alloca) nounwind + %struct.byte.8 = getelementptr inbounds i8, ptr 
%struct.alloca, i64 8 + %struct.byte.4 = getelementptr inbounds i8, ptr %struct.alloca, i64 4 + + ; Set of removable memory deffs + %m1 = tail call ptr @malloc(i64 4) + %m2 = tail call ptr @malloc(i64 4) + store i32 0, ptr %struct.byte.4 + store i32 0, ptr %struct.byte.8 + call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %m2, i8 0, i64 4, i1 false) + call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %m1, i8 0, i64 4, i1 false) + + ; Set %struct.alloca[8, 16) to 42. + call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %struct.byte.8, i8 42, i64 8, i1 false) + ; Set %struct.alloca[8, 12) to 43. + store i32 43, ptr %struct.byte.8, align 4 + ; Set %struct.alloca[4, 8) to 44. + store i32 44, ptr %struct.byte.4, align 4 + ; Return %struct.alloca[8, 16). + %ret = load ptr, ptr %struct.byte.8 + call void @readnone(ptr %struct.byte.4); + call void @readnone(ptr %struct.byte.8); + call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %struct.alloca) nounwind + call void @use(ptr %m1) + call void @use(ptr %m2) + ret ptr %ret +} + +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) + +declare noalias ptr @malloc(i64) willreturn allockind("alloc,uninitialized") "alloc-family"="malloc" +declare void @readnone(ptr) readnone nounwind +declare void @free(ptr nocapture) allockind("free") "alloc-family"="malloc" + +declare void @use(ptr) diff --git a/llvm/test/Transforms/DeadStoreElimination/malloc-earliest-escape-info-invalidation.ll b/llvm/test/Transforms/DeadStoreElimination/malloc-earliest-escape-info-invalidation.ll new file mode 100644 index 00000000000000..60a010cd49ceda --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/malloc-earliest-escape-info-invalidation.ll @@ -0,0 +1,302 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -p 
dse -S %s | FileCheck %s + +target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" + +define void @widget(ptr %a) { +; CHECK-LABEL: define void @widget( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[CALL1:%.*]] = tail call noalias ptr @malloc(i64 0) +; CHECK-NEXT: store ptr [[CALL1]], ptr [[A]], align 8 +; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[A]], align 8 +; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[LOAD]], align 8 +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i8, ptr [[CALL1]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR3:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR]], i64 1 +; CHECK-NEXT: [[GETELEMENTPTR4:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR]], i64 8 +; CHECK-NEXT: store i16 0, ptr [[GETELEMENTPTR4]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR5:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR]], i64 12 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[LOAD6:%.*]] = load i32, ptr inttoptr (i64 4 to ptr), align 4 +; CHECK-NEXT: br label [[BB48:%.*]] +; CHECK: bb7: +; CHECK-NEXT: br label [[BB9:%.*]] +; CHECK: bb8: +; CHECK-NEXT: br label [[BB53:%.*]] +; CHECK: bb9: +; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[CALL1]], [[BB7:%.*]] ], [ [[A]], [[BB43:%.*]] ] +; CHECK-NEXT: [[GETELEMENTPTR10:%.*]] = getelementptr i8, ptr [[PHI]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR11:%.*]] = getelementptr i8, ptr [[PHI]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR12:%.*]] = getelementptr i8, ptr [[PHI]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR13:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 1 +; CHECK-NEXT: store i8 0, ptr [[CALL1]], align 1 +; CHECK-NEXT: br label [[BB29:%.*]] +; CHECK: bb14: +; CHECK-NEXT: [[GETELEMENTPTR15:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR10]], i64 8 +; CHECK-NEXT: [[LOAD16:%.*]] = load i16, ptr [[CALL1]], align 4 +; CHECK-NEXT: br i1 false, label [[BB22:%.*]], label [[BB17:%.*]] +; CHECK: bb17: +; CHECK-NEXT: [[GETELEMENTPTR18:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR11]], 
i64 8 +; CHECK-NEXT: [[LOAD19:%.*]] = load i16, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR20:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 8 +; CHECK-NEXT: store i16 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR21:%.*]] = getelementptr i8, ptr [[PHI]], i64 0 +; CHECK-NEXT: br label [[BB25:%.*]] +; CHECK: bb22: +; CHECK-NEXT: [[GETELEMENTPTR23:%.*]] = getelementptr i8, ptr [[PHI]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR24:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR23]], i64 12 +; CHECK-NEXT: br label [[BB25]] +; CHECK: bb25: +; CHECK-NEXT: [[PHI26:%.*]] = phi ptr [ [[A]], [[BB17]] ], [ [[CALL1]], [[BB22]] ] +; CHECK-NEXT: [[PHI27:%.*]] = phi ptr [ [[CALL1]], [[BB17]] ], [ [[CALL1]], [[BB22]] ] +; CHECK-NEXT: [[PHI28:%.*]] = phi ptr [ [[CALL1]], [[BB17]] ], [ [[CALL1]], [[BB22]] ] +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: br label [[BB29]] +; CHECK: bb29: +; CHECK-NEXT: [[PHI30:%.*]] = phi ptr [ [[CALL1]], [[BB9]] ], [ [[CALL1]], [[BB25]] ] +; CHECK-NEXT: [[PHI31:%.*]] = phi ptr [ [[CALL1]], [[BB9]] ], [ [[CALL1]], [[BB25]] ] +; CHECK-NEXT: [[LOAD32:%.*]] = load i8, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[LOAD33:%.*]] = load i8, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR34:%.*]] = getelementptr i8, ptr [[PHI31]], i64 12 +; CHECK-NEXT: [[GETELEMENTPTR35:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 12 +; CHECK-NEXT: br label [[BB86:%.*]] +; CHECK: bb36: +; CHECK-NEXT: [[GETELEMENTPTR37:%.*]] = getelementptr i8, ptr [[PHI30]], i64 12 +; CHECK-NEXT: br label [[BB38:%.*]] +; CHECK: bb38: +; CHECK-NEXT: [[GETELEMENTPTR39:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR34]], i64 0, i64 0 +; CHECK-NEXT: [[LOAD40:%.*]] = load i32, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR41:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR37]], i64 0, i64 0 +; CHECK-NEXT: [[LOAD42:%.*]] = load i32, ptr [[CALL1]], align 4 +; CHECK-NEXT: br label [[BB38]] +; CHECK: bb43: +; CHECK-NEXT: 
[[GETELEMENTPTR44:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 8 +; CHECK-NEXT: [[LOAD45:%.*]] = load i16, ptr [[CALL1]], align 4 +; CHECK-NEXT: store i16 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: store i8 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR46:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 12 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR47:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 16 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: br label [[BB9]] +; CHECK: bb48: +; CHECK-NEXT: [[GETELEMENTPTR49:%.*]] = getelementptr i8, ptr [[CALL1]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR50:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR49]], i64 1 +; CHECK-NEXT: [[GETELEMENTPTR51:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR49]], i64 8 +; CHECK-NEXT: [[GETELEMENTPTR52:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR49]], i64 12 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: br label [[BB48]] +; CHECK: bb53: +; CHECK-NEXT: [[PHI54:%.*]] = phi ptr [ [[CALL1]], [[BB8:%.*]] ], [ [[A]], [[BB71:%.*]] ] +; CHECK-NEXT: [[GETELEMENTPTR55:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR56:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR57:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR58:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 1 +; CHECK-NEXT: br label [[BB71]] +; CHECK: bb59: +; CHECK-NEXT: [[GETELEMENTPTR60:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0 +; CHECK-NEXT: [[GETELEMENTPTR61:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR60]], i64 12 +; CHECK-NEXT: br label [[BB67:%.*]] +; CHECK: bb62: +; CHECK-NEXT: [[GETELEMENTPTR63:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR56]], i64 8 +; CHECK-NEXT: [[LOAD64:%.*]] = load i16, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR65:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 8 +; CHECK-NEXT: store i16 0, ptr 
[[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR66:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0 +; CHECK-NEXT: br label [[BB67]] +; CHECK: bb67: +; CHECK-NEXT: [[PHI68:%.*]] = phi ptr [ [[A]], [[BB62:%.*]] ], [ [[CALL1]], [[BB59:%.*]] ] +; CHECK-NEXT: [[PHI69:%.*]] = phi ptr [ [[CALL1]], [[BB62]] ], [ [[CALL1]], [[BB59]] ] +; CHECK-NEXT: [[PHI70:%.*]] = phi ptr [ [[CALL1]], [[BB62]] ], [ [[CALL1]], [[BB59]] ] +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: br label [[BB71]] +; CHECK: bb71: +; CHECK-NEXT: [[PHI72:%.*]] = phi ptr [ [[CALL1]], [[BB53]] ], [ [[CALL1]], [[BB67]] ] +; CHECK-NEXT: [[PHI73:%.*]] = phi ptr [ [[CALL1]], [[BB53]] ], [ [[CALL1]], [[BB67]] ] +; CHECK-NEXT: [[LOAD74:%.*]] = load i8, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[LOAD75:%.*]] = load i8, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR76:%.*]] = getelementptr i8, ptr [[PHI72]], i64 12 +; CHECK-NEXT: [[GETELEMENTPTR77:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 12 +; CHECK-NEXT: [[GETELEMENTPTR78:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR76]], i64 0, i64 0 +; CHECK-NEXT: [[LOAD79:%.*]] = load i32, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR80:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR77]], i64 0, i64 0 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[LOAD81:%.*]] = load i8, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR82:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 8 +; CHECK-NEXT: [[LOAD83:%.*]] = load i16, ptr [[CALL1]], align 4 +; CHECK-NEXT: store i16 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: store i8 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR84:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 12 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR85:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 16 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: br label [[BB53]] +; CHECK: bb86: +; CHECK-NEXT: 
[[GETELEMENTPTR87:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR34]], i64 0, i64 0 +; CHECK-NEXT: [[LOAD88:%.*]] = load i32, ptr [[CALL1]], align 4 +; CHECK-NEXT: [[GETELEMENTPTR89:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR35]], i64 0, i64 0 +; CHECK-NEXT: store i32 0, ptr [[CALL1]], align 4 +; CHECK-NEXT: br label [[BB86]] +; +bb: + %call = tail call ptr @malloc(i64 1) + tail call void @llvm.memset.p0.i64(ptr %call, i8 0, i64 1, i1 false) + %call1 = tail call noalias ptr @malloc(i64 0) + store ptr %call1, ptr %a, align 8 + %load = load ptr, ptr %a, align 8 + %load2 = load i32, ptr %load, align 8 + %getelementptr = getelementptr i8, ptr %call1, i64 0 + %getelementptr3 = getelementptr i8, ptr %getelementptr, i64 1 + store i8 0, ptr %call1, align 1 + %getelementptr4 = getelementptr i8, ptr %getelementptr, i64 8 + store i16 0, ptr %getelementptr4, align 4 + %getelementptr5 = getelementptr i8, ptr %getelementptr, i64 12 + store i32 0, ptr %call1, align 4 + %load6 = load i32, ptr inttoptr (i64 4 to ptr), align 4 + br label %bb48 + +bb7: ; No predecessors! + br label %bb9 + +bb8: ; No predecessors! + br label %bb53 + +bb9: ; preds = %bb43, %bb7 + %phi = phi ptr [ %call1, %bb7 ], [ %a, %bb43 ] + %getelementptr10 = getelementptr i8, ptr %phi, i64 0 + %getelementptr11 = getelementptr i8, ptr %phi, i64 0 + %getelementptr12 = getelementptr i8, ptr %phi, i64 0 + %getelementptr13 = getelementptr i8, ptr %getelementptr12, i64 1 + store i8 0, ptr %call1, align 1 + br label %bb29 + +bb14: ; No predecessors! 
+ %getelementptr15 = getelementptr i8, ptr %getelementptr10, i64 8 + %load16 = load i16, ptr %call1, align 4 + br i1 false, label %bb22, label %bb17 + +bb17: ; preds = %bb14 + %getelementptr18 = getelementptr i8, ptr %getelementptr11, i64 8 + %load19 = load i16, ptr %call1, align 4 + %getelementptr20 = getelementptr i8, ptr %getelementptr12, i64 8 + store i16 0, ptr %call1, align 4 + %getelementptr21 = getelementptr i8, ptr %phi, i64 0 + br label %bb25 + +bb22: ; preds = %bb14 + %getelementptr23 = getelementptr i8, ptr %phi, i64 0 + %getelementptr24 = getelementptr i8, ptr %getelementptr23, i64 12 + br label %bb25 + +bb25: ; preds = %bb22, %bb17 + %phi26 = phi ptr [ %a, %bb17 ], [ %call1, %bb22 ] + %phi27 = phi ptr [ %call1, %bb17 ], [ %call1, %bb22 ] + %phi28 = phi ptr [ %call1, %bb17 ], [ %call1, %bb22 ] + store i32 0, ptr %call1, align 4 + br label %bb29 + +bb29: ; preds = %bb25, %bb9 + %phi30 = phi ptr [ %call1, %bb9 ], [ %call1, %bb25 ] + %phi31 = phi ptr [ %call1, %bb9 ], [ %call1, %bb25 ] + %load32 = load i8, ptr %call1, align 4 + %load33 = load i8, ptr %call1, align 4 + %getelementptr34 = getelementptr i8, ptr %phi31, i64 12 + %getelementptr35 = getelementptr i8, ptr %getelementptr12, i64 12 + br label %bb86 + +bb36: ; No predecessors! + %getelementptr37 = getelementptr i8, ptr %phi30, i64 12 + br label %bb38 + +bb38: ; preds = %bb38, %bb36 + %getelementptr39 = getelementptr [0 x i32], ptr %getelementptr34, i64 0, i64 0 + %load40 = load i32, ptr %call1, align 4 + %getelementptr41 = getelementptr [0 x i32], ptr %getelementptr37, i64 0, i64 0 + %load42 = load i32, ptr %call1, align 4 + br label %bb38 + +bb43: ; No predecessors! 
+ %getelementptr44 = getelementptr i8, ptr %getelementptr12, i64 8 + %load45 = load i16, ptr %call1, align 4 + store i16 0, ptr %call1, align 4 + store i8 0, ptr %call1, align 4 + %getelementptr46 = getelementptr i8, ptr %getelementptr12, i64 12 + store i32 0, ptr %call1, align 4 + %getelementptr47 = getelementptr i8, ptr %getelementptr12, i64 16 + store i32 0, ptr %call1, align 4 + br label %bb9 + +bb48: ; preds = %bb48, %bb + %getelementptr49 = getelementptr i8, ptr %call1, i64 0 + %getelementptr50 = getelementptr i8, ptr %getelementptr49, i64 1 + store i8 0, ptr %call1, align 1 + %getelementptr51 = getelementptr i8, ptr %getelementptr49, i64 8 + store i16 0, ptr %call1, align 4 + %getelementptr52 = getelementptr i8, ptr %getelementptr49, i64 12 + store i32 0, ptr %call1, align 4 + br label %bb48 + +bb53: ; preds = %bb71, %bb8 + %phi54 = phi ptr [ %call1, %bb8 ], [ %a, %bb71 ] + %getelementptr55 = getelementptr i8, ptr %phi54, i64 0 + %getelementptr56 = getelementptr i8, ptr %phi54, i64 0 + %getelementptr57 = getelementptr i8, ptr %phi54, i64 0 + %getelementptr58 = getelementptr i8, ptr %getelementptr57, i64 1 + br label %bb71 + +bb59: ; No predecessors! + %getelementptr60 = getelementptr i8, ptr %phi54, i64 0 + %getelementptr61 = getelementptr i8, ptr %getelementptr60, i64 12 + br label %bb67 + +bb62: ; No predecessors! 
+ %getelementptr63 = getelementptr i8, ptr %getelementptr56, i64 8 + %load64 = load i16, ptr %call1, align 4 + %getelementptr65 = getelementptr i8, ptr %getelementptr57, i64 8 + store i16 0, ptr %call1, align 4 + %getelementptr66 = getelementptr i8, ptr %phi54, i64 0 + br label %bb67 + +bb67: ; preds = %bb62, %bb59 + %phi68 = phi ptr [ %a, %bb62 ], [ %call1, %bb59 ] + %phi69 = phi ptr [ %call1, %bb62 ], [ %call1, %bb59 ] + %phi70 = phi ptr [ %call1, %bb62 ], [ %call1, %bb59 ] + store i32 0, ptr %call1, align 4 + br label %bb71 + +bb71: ; preds = %bb67, %bb53 + %phi72 = phi ptr [ %call1, %bb53 ], [ %call1, %bb67 ] + %phi73 = phi ptr [ %call1, %bb53 ], [ %call1, %bb67 ] + %load74 = load i8, ptr %call1, align 4 + %load75 = load i8, ptr %call1, align 4 + %getelementptr76 = getelementptr i8, ptr %phi72, i64 12 + %getelementptr77 = getelementptr i8, ptr %getelementptr57, i64 12 + %getelementptr78 = getelementptr [0 x i32], ptr %getelementptr76, i64 0, i64 0 + %load79 = load i32, ptr %call1, align 4 + %getelementptr80 = getelementptr [0 x i32], ptr %getelementptr77, i64 0, i64 0 + store i32 0, ptr %call1, align 4 + %load81 = load i8, ptr %call1, align 4 + %getelementptr82 = getelementptr i8, ptr %getelementptr57, i64 8 + %load83 = load i16, ptr %call1, align 4 + store i16 0, ptr %call1, align 4 + store i8 0, ptr %call1, align 4 + %getelementptr84 = getelementptr i8, ptr %getelementptr57, i64 12 + store i32 0, ptr %call1, align 4 + %getelementptr85 = getelementptr i8, ptr %getelementptr57, i64 16 + store i32 0, ptr %call1, align 4 + br label %bb53 + +bb86: ; preds = %bb86, %bb29 + %getelementptr87 = getelementptr [0 x i32], ptr %getelementptr34, i64 0, i64 0 + %load88 = load i32, ptr %call1, align 4 + %getelementptr89 = getelementptr [0 x i32], ptr %getelementptr35, i64 0, i64 0 + store i32 0, ptr %call1, align 4 + br label %bb86 +} + +declare ptr @malloc(i64) + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void 
@llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #0 + +attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: write) } diff --git a/llvm/test/Transforms/Float2Int/pr79158.ll b/llvm/test/Transforms/Float2Int/pr79158.ll new file mode 100644 index 00000000000000..5e78cc0bc66fdb --- /dev/null +++ b/llvm/test/Transforms/Float2Int/pr79158.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=float2int -S | FileCheck %s + +define i32 @pr79158(i32 %x) { +; CHECK-LABEL: define i32 @pr79158( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = zext i1 [[CMP]] to i64 +; CHECK-NEXT: [[MUL1:%.*]] = mul i64 [[TMP0]], 4294967295 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[MUL1]] to i32 +; CHECK-NEXT: ret i32 [[TMP1]] +; +entry: + %cmp = icmp sgt i32 %x, 0 + %conv = uitofp i1 %cmp to double + %mul = fmul double %conv, 0x41EFFFFFFFE00000 + %conv1 = fptoui double %mul to i32 + ret i32 %conv1 +} diff --git a/llvm/test/Transforms/GVN/pr82884.ll b/llvm/test/Transforms/GVN/pr82884.ll new file mode 100644 index 00000000000000..71abafda60d93d --- /dev/null +++ b/llvm/test/Transforms/GVN/pr82884.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=gvn < %s | FileCheck %s + +; Make sure nsw/nuw flags are dropped. 
+ +define i32 @pr82884(i32 %x) { +; CHECK-LABEL: define i32 @pr82884( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[X]], [[X]] +; CHECK-NEXT: call void @use(i32 [[MUL]]) +; CHECK-NEXT: [[MUL2:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[X]], i32 [[X]]) +; CHECK-NEXT: ret i32 [[MUL]] +; + %mul = mul nsw nuw i32 %x, %x + call void @use(i32 %mul) + %mul2 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %x, i32 %x) + %ret = extractvalue { i32, i1 } %mul2, 0 + ret i32 %ret +} + +declare void @use(i32) diff --git a/llvm/test/Transforms/IRCE/pr89959.ll b/llvm/test/Transforms/IRCE/pr89959.ll new file mode 100644 index 00000000000000..dc7c0dfbc57a97 --- /dev/null +++ b/llvm/test/Transforms/IRCE/pr89959.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=irce -S < %s 2>&1 | FileCheck %s + +; Make sure we don't crash. +define void @pr89959() { +; CHECK-LABEL: define void @pr89959() { +; CHECK-NEXT: top: +; CHECK-NEXT: br label [[L3:%.*]] +; CHECK: L3: +; CHECK-NEXT: [[VALUE_PHI:%.*]] = phi ptr [ null, [[TOP:%.*]] ], [ [[TMP0:%.*]], [[L13:%.*]] ] +; CHECK-NEXT: [[TMP0]] = getelementptr i8, ptr [[VALUE_PHI]], i64 8 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp ule ptr [[VALUE_PHI]], null +; CHECK-NEXT: br i1 [[DOTNOT]], label [[L13]], label [[L15:%.*]] +; CHECK: L13: +; CHECK-NEXT: br label [[L3]] +; CHECK: L15: +; CHECK-NEXT: ret void +; +top: + br label %L3 + +L3: + %value_phi = phi ptr [ null, %top ], [ %0, %L13 ] + %0 = getelementptr i8, ptr %value_phi, i64 8 + %.not = icmp ule ptr %value_phi, null + br i1 %.not, label %L13, label %L15 + +L13: + br label %L3 + +L15: + ret void +} diff --git a/llvm/test/Transforms/IndVarSimplify/pr55925.ll b/llvm/test/Transforms/IndVarSimplify/pr55925.ll index 420fc209949d4f..312a8295ccdc9f 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr55925.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr55925.ll @@ -18,9 +18,9 @@ 
define void @test(ptr %p) personality ptr undef { ; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foo(i32 returned [[TMP0]]) ; CHECK-NEXT: to label [[LOOP_LATCH]] unwind label [[EXIT:%.*]] ; CHECK: loop.latch: -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @foo(i32 [[TMP1]]) +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: br label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: [[LP:%.*]] = landingpad { ptr, i32 } @@ -64,8 +64,8 @@ define void @test_critedge(i1 %c, ptr %p) personality ptr undef { ; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP1]], [[LOOP_INVOKE]] ], [ 0, [[LOOP_OTHER]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @foo(i32 [[PHI]]) +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: br label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: [[LP:%.*]] = landingpad { ptr, i32 } diff --git a/llvm/test/Transforms/IndVarSimplify/pr79861.ll b/llvm/test/Transforms/IndVarSimplify/pr79861.ll new file mode 100644 index 00000000000000..66250944961397 --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/pr79861.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=indvars < %s | FileCheck %s + +target datalayout = "n64" + +declare void @use(i64) + +define void @or_disjoint() { +; CHECK-LABEL: define void @or_disjoint() { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 2, [[ENTRY:%.*]] ], [ [[IV_DEC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[SEL:%.*]] = select i1 false, i64 [[OR]], i64 [[ADD]] +; CHECK-NEXT: call void @use(i64 [[SEL]]) +; CHECK-NEXT: [[IV_DEC]] = 
add nsw i64 [[IV]], -1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_DEC]], 0 +; CHECK-NEXT: br i1 [[EXIT_COND]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 2, %entry ], [ %iv.dec, %loop ] + %or = or disjoint i64 %iv, 1 + %add = add nsw i64 %iv, 1 + %sel = select i1 false, i64 %or, i64 %add + call void @use(i64 %sel) + + %iv.dec = add nsw i64 %iv, -1 + %exit.cond = icmp eq i64 %iv.dec, 0 + br i1 %exit.cond, label %exit, label %loop + +exit: + ret void +} + +define void @add_nowrap_flags(i64 %n) { +; CHECK-LABEL: define void @add_nowrap_flags( +; CHECK-SAME: i64 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[IV]], 123 +; CHECK-NEXT: call void @use(i64 [[ADD1]]) +; CHECK-NEXT: [[IV_INC]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_INC]], [[N]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.inc, %loop ] + %add1 = add nuw nsw i64 %iv, 123 + %add2 = add i64 %iv, 123 + %sel = select i1 false, i64 %add1, i64 %add2 + call void @use(i64 %sel) + + %iv.inc = add i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.inc, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + ret void +} + + +define void @expander_or_disjoint(i64 %n) { +; CHECK-LABEL: define void @expander_or_disjoint( +; CHECK-SAME: i64 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[N]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_INC]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[IV]], 
[[OR]] +; CHECK-NEXT: call void @use(i64 [[ADD]]) +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_INC]], [[TMP0]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %or = or disjoint i64 %n, 1 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.inc, %loop ] + %iv.inc = add i64 %iv, 1 + %add = add i64 %iv, %or + call void @use(i64 %add) + %cmp = icmp ult i64 %iv, %n + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll index d6b1f3ef45e765..7723e6c664c3d5 100644 --- a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll +++ b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll @@ -1,71 +1,70 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+sme -S -passes=inline | FileCheck %s -declare void @inlined_body() "aarch64_pstate_sm_compatible"; +declare i32 @llvm.vscale.i32() -; Define some functions that will be called by the functions below. -; These just call a '...body()' function. If we see the call to one of -; these functions being replaced by '...body()', then we know it has been -; inlined. +; Define some functions that merely call llvm.vscale.i32(), which will be called +; by the other functions below. If we see the call to one of these functions +; being replaced by 'llvm.vscale()', then we know it has been inlined. 
-define void @normal_callee() { -; CHECK-LABEL: define void @normal_callee +define i32 @normal_callee() { +; CHECK-LABEL: define i32 @normal_callee ; CHECK-SAME: () #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @inlined_body() - ret void + %res = call i32 @llvm.vscale.i32() + ret i32 %res } -define void @streaming_callee() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_callee +define i32 @streaming_callee() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_callee ; CHECK-SAME: () #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @inlined_body() - ret void + %res = call i32 @llvm.vscale.i32() + ret i32 %res } -define void @locally_streaming_callee() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_callee +define i32 @locally_streaming_callee() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_callee ; CHECK-SAME: () #[[ATTR3:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @inlined_body() - ret void + %res = call i32 @llvm.vscale.i32() + ret i32 %res } -define void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_callee +define i32 @streaming_compatible_callee() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_callee ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vscale.i32() +; 
CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @inlined_body() - ret void + %res = call i32 @llvm.vscale.i32() + ret i32 %res } -define void @streaming_compatible_locally_streaming_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_callee +define i32 @streaming_compatible_locally_streaming_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_callee ; CHECK-SAME: () #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @inlined_body() - ret void + %res = call i32 @llvm.vscale() + ret i32 %res } ; Now test that inlining only happens when their streaming modes match. @@ -85,16 +84,16 @@ entry: ; [ ] N -> SC ; [ ] N -> N + B ; [ ] N -> SC + B -define void @normal_caller_normal_callee_inline() { -; CHECK-LABEL: define void @normal_caller_normal_callee_inline +define i32 @normal_caller_normal_callee_inline() { +; CHECK-LABEL: define i32 @normal_caller_normal_callee_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] N -> N @@ -102,16 +101,16 @@ entry: ; [ ] N -> SC ; [ ] N -> N + B ; [ ] N -> SC + B -define void @normal_caller_streaming_callee_inline() { -; CHECK-LABEL: define void @normal_caller_streaming_callee_inline +define i32 @normal_caller_streaming_callee_dont_inline() { +; CHECK-LABEL: define i32 @normal_caller_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: 
[[RES:%.*]] = call i32 @streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] N -> N @@ -119,16 +118,16 @@ entry: ; [x] N -> SC ; [ ] N -> N + B ; [ ] N -> SC + B -define void @normal_caller_streaming_compatible_callee_inline() { -; CHECK-LABEL: define void @normal_caller_streaming_compatible_callee_inline +define i32 @normal_caller_streaming_compatible_callee_inline() { +; CHECK-LABEL: define i32 @normal_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] N -> N @@ -136,16 +135,16 @@ entry: ; [ ] N -> SC ; [x] N -> N + B ; [ ] N -> SC + B -define void @normal_caller_locally_streaming_callee_inline() { -; CHECK-LABEL: define void @normal_caller_locally_streaming_callee_inline +define i32 @normal_caller_locally_streaming_callee_dont_inline() { +; CHECK-LABEL: define i32 @normal_caller_locally_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @locally_streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] N -> N @@ -153,16 +152,16 @@ entry: ; [ ] N -> SC ; [ ] N -> N + B ; [x] N -> SC + B -define void @normal_caller_streaming_compatible_locally_streaming_callee_inline() { -; CHECK-LABEL: define void @normal_caller_streaming_compatible_locally_streaming_callee_inline +define i32 @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline() { +; CHECK-LABEL: define i32 
@normal_caller_streaming_compatible_locally_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @streaming_compatible_locally_streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret i32 %res } ; [x] S -> N @@ -170,16 +169,16 @@ entry: ; [ ] S -> SC ; [ ] S -> N + B ; [ ] S -> SC + B -define void @streaming_caller_normal_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_normal_callee_inline +define i32 @streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_normal_callee_dont_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] S -> N @@ -187,16 +186,16 @@ entry: ; [ ] S -> SC ; [ ] S -> N + B ; [ ] S -> SC + B -define void @streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_streaming_callee_inline +define i32 @streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_streaming_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] S -> N @@ -204,16 +203,16 @@ entry: ; [x] S -> SC ; [ ] S -> N + B ; [ ] S -> SC + B -define void 
@streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_streaming_compatible_callee_inline +define i32 @streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] S -> N @@ -221,16 +220,16 @@ entry: ; [ ] S -> SC ; [x] S -> N + B ; [ ] S -> SC + B -define void @streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_locally_streaming_callee_inline +define i32 @streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] S -> N @@ -238,16 +237,16 @@ entry: ; [ ] S -> SC ; [ ] S -> N + B ; [x] S -> SC + B -define void @streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_streaming_compatible_locally_streaming_callee_inline +define i32 @streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_streaming_compatible_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; 
CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret i32 %res } ; [x] N + B -> N @@ -255,16 +254,16 @@ entry: ; [ ] N + B -> SC ; [ ] N + B -> N + B ; [ ] N + B -> SC + B -define void @locally_streaming_caller_normal_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_normal_callee_inline +define i32 @locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_caller_normal_callee_dont_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] N + B -> N @@ -272,16 +271,16 @@ entry: ; [ ] N + B -> SC ; [ ] N + B -> N + B ; [ ] N + B -> SC + B -define void @locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_streaming_callee_inline +define i32 @locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_caller_streaming_callee_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] N + B -> N @@ -289,16 +288,16 @@ entry: ; [x] N + B -> SC ; [ ] N + B -> N + B ; [ ] N + B -> SC + B -define void 
@locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_streaming_compatible_callee_inline +define i32 @locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] N + B -> N @@ -306,16 +305,16 @@ entry: ; [ ] N + B -> SC ; [x] N + B -> N + B ; [ ] N + B -> SC + B -define void @locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_locally_streaming_callee_inline +define i32 @locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_caller_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] N + B -> N @@ -323,16 +322,16 @@ entry: ; [ ] N + B -> SC ; [ ] N + B -> N + B ; [x] N + B -> SC + B -define void @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline +define i32 @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 
@locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret i32 %res } ; [x] SC -> N @@ -340,16 +339,16 @@ entry: ; [ ] SC -> SC ; [ ] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_normal_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_normal_callee_inline +define i32 @streaming_compatible_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_normal_callee_dont_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] SC -> N @@ -357,16 +356,16 @@ entry: ; [ ] SC -> SC ; [ ] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_streaming_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_streaming_callee_inline +define i32 @streaming_compatible_caller_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] SC 
-> N @@ -374,16 +373,16 @@ entry: ; [x] SC -> SC ; [ ] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_streaming_compatible_callee_inline +define i32 @streaming_compatible_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] SC -> N @@ -391,16 +390,16 @@ entry: ; [ ] SC -> SC ; [x] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_locally_streaming_callee_inline +define i32 @streaming_compatible_caller_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_locally_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @locally_streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] SC -> N @@ -408,32 +407,32 @@ entry: ; [ ] SC -> SC ; [ ] SC -> N + B ; [x] SC -> SC + B -define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_inline 
+define i32 @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @streaming_compatible_locally_streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret i32 %res } ; [x] SC + B -> N ; [ ] SC + B -> S ; [ ] SC + B -> SC ; [ ] SC + B -> N + B ; [ ] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_normal_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_normal_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] SC + B -> N @@ -441,16 +440,16 @@ entry: ; [ ] SC + B -> SC ; [ ] SC + B -> N + B ; [ ] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_streaming_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_streaming_callee_inline() 
"aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_streaming_callee_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] SC + B -> N @@ -458,16 +457,16 @@ entry: ; [x] SC + B -> SC ; [ ] SC + B -> N + B ; [ ] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] SC + B -> N @@ -475,16 +474,16 @@ entry: ; [ ] SC + B -> SC ; [x] SC + B -> N + B ; [ ] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 
@streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] SC + B -> N @@ -492,16 +491,16 @@ entry: ; [ ] SC + B -> SC ; [ ] SC + B -> N + B ; [x] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_and_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_and_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_and_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_and_callee_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret i32 %res } define void @normal_callee_with_inlineasm() { diff --git a/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll b/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll index 3a30980fe31bd7..6f582cab2f1452 100644 --- a/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll +++ b/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll @@ -93,3 +93,29 @@ define internal void @caller_not_avx4() { } declare i64 @caller_unknown_simple(i64) + +; This call should get inlined, because the callee only contains +; inline ASM, not real calls. 
+define <8 x i64> @caller_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2) #0 { +; CHECK-LABEL: define {{[^@]+}}@caller_inline_asm +; CHECK-SAME: (ptr [[P0:%.*]], i64 [[K:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[SRC_I:%.*]] = load <8 x i64>, ptr [[P0]], align 64 +; CHECK-NEXT: [[A_I:%.*]] = load <8 x i64>, ptr [[P1]], align 64 +; CHECK-NEXT: [[B_I:%.*]] = load <8 x i64>, ptr [[P2]], align 64 +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> asm "vpaddb\09$($3, $2, $0 {$1}", "=v,^Yk,v,v,0,~{dirflag},~{fpsr},~{flags}"(i64 [[K]], <8 x i64> [[A_I]], <8 x i64> [[B_I]], <8 x i64> [[SRC_I]]) +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %call = call <8 x i64> @callee_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2) + ret <8 x i64> %call +} + +define internal <8 x i64> @callee_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2) #1 { + %src = load <8 x i64>, ptr %p0, align 64 + %a = load <8 x i64>, ptr %p1, align 64 + %b = load <8 x i64>, ptr %p2, align 64 + %3 = tail call <8 x i64> asm "vpaddb\09$($3, $2, $0 {$1}", "=v,^Yk,v,v,0,~{dirflag},~{fpsr},~{flags}"(i64 %k, <8 x i64> %a, <8 x i64> %b, <8 x i64> %src) #2 + ret <8 x i64> %3 +} + +attributes #0 = { "min-legal-vector-width"="512" "target-features"="+avx,+avx2,+avx512bw,+avx512dq,+avx512f,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" } +attributes #1 = { "min-legal-vector-width"="512" "target-features"="+avx,+avx2,+avx512bw,+avx512f,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" } diff --git a/llvm/test/Transforms/Inline/inline-sign-return-address.ll b/llvm/test/Transforms/Inline/inline-sign-return-address.ll new file mode 100644 index 00000000000000..c4d85fa671a4f6 --- /dev/null +++ b/llvm/test/Transforms/Inline/inline-sign-return-address.ll @@ -0,0 +1,104 @@ +; Check the inliner doesn't inline a function with different sign return 
address schemes. +; RUN: opt < %s -passes=inline -S | FileCheck %s + +define internal void @foo_all() #0 { + ret void +} + +define internal void @foo_nonleaf() #1 { + ret void +} + +define internal void @foo_none() #2 { + ret void +} + +define internal void @foo_lr() #3 { + ret void +} + +define internal void @foo_bkey() #4 { + ret void +} + +define dso_local void @bar_all() #0 { +; CHECK-LABEL: bar_all +; CHECK-NOT: call void @foo_all() +; CHECK-NEXT: call void @foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_nonleaf() #1 { +; CHECK-LABEL: bar_nonleaf +; CHECK-NEXT: call void @foo_all() +; CHECK-NOT: call void @foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_none() #2 { +; CHECK-LABEL: bar_none +; CHECK-NEXT: call void @foo_all() +; CHECK-NEXT: call void @foo_nonleaf() +; CHECK-NOT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_lr() #3 { +; CHECK-LABEL: bar_lr +; CHECK-NEXT: call void @foo_all() +; CHECK-NEXT: call void @foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NOT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_bkey() #4 { +; CHECK-LABEL: bar_bkey +; CHECK-NEXT: call void @foo_all() +; CHECK-NEXT: 
call void @foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NOT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + + +attributes #0 = { "branch-protection-pauth-lr"="false" "sign-return-address"="all" } +attributes #1 = { "branch-protection-pauth-lr"="false" "sign-return-address"="non-leaf" } +attributes #2 = { "branch-protection-pauth-lr"="false" "sign-return-address"="none" } +attributes #3 = { "branch-protection-pauth-lr"="true" "sign-return-address"="non-leaf" } +attributes #4 = { "branch-protection-pauth-lr"="true" "sign-return-address"="non-leaf" "sign-return-address-key"="b_key" } \ No newline at end of file diff --git a/llvm/test/Transforms/InstCombine/bitcast.ll b/llvm/test/Transforms/InstCombine/bitcast.ll index 58bd81297b0dd9..5ace1039c37825 100644 --- a/llvm/test/Transforms/InstCombine/bitcast.ll +++ b/llvm/test/Transforms/InstCombine/bitcast.ll @@ -686,6 +686,21 @@ define ptr @bitcast_from_single_element_pointer_vector_to_pointer(<1 x ptr> %ptr ret ptr %ptr } +; Sure that we calculate the correct shift. 
+define <4 x i32> @bitcast_shl(i32 %arg) { +; CHECK-LABEL: @bitcast_shl( +; CHECK-NEXT: [[I5:%.*]] = insertelement <4 x i32> , i32 [[ARG:%.*]], i64 3 +; CHECK-NEXT: ret <4 x i32> [[I5]] +; + %i = zext i32 %arg to i64 + %i1 = shl i64 %i, 32 + %i2 = or i64 %i1, 65 + %i3 = zext i64 %i2 to i128 + %i4 = shl i128 %i3, 64 + %i5 = bitcast i128 %i4 to <4 x i32> + ret <4 x i32> %i5 +} + declare void @f1() declare void @f2() define ptr @select_bitcast_unsized_pointer(i1 %c) { diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll index 642c3eb2a0e41b..c90b6c9fb29592 100644 --- a/llvm/test/Transforms/InstCombine/getelementptr.ll +++ b/llvm/test/Transforms/InstCombine/getelementptr.ll @@ -116,6 +116,7 @@ define void @test_overaligned_vec(i8 %B) { ; CHECK-LABEL: @test_overaligned_vec( ; CHECK-NEXT: store i8 [[B:%.*]], ptr getelementptr inbounds ([10 x i8], ptr @Global, i64 0, i64 2), align 1 ; CHECK-NEXT: ret void +; %A = getelementptr <2 x half>, ptr @Global, i64 0, i64 1 store i8 %B, ptr %A ret void @@ -1473,6 +1474,16 @@ define ptr @gep_sdiv(ptr %p, i64 %off) { ret ptr %ptr } +define ptr @gep_udiv(ptr %p, i64 %off) { +; CHECK-LABEL: @gep_udiv( +; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[OFF:%.*]] +; CHECK-NEXT: ret ptr [[PTR]] +; + %index = udiv exact i64 %off, 7 + %ptr = getelementptr %struct.C, ptr %p, i64 %index + ret ptr %ptr +} + define <2 x ptr> @gep_sdiv_vec(<2 x ptr> %p, <2 x i64> %off) { ; CHECK-LABEL: @gep_sdiv_vec( ; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, <2 x ptr> [[P:%.*]], <2 x i64> [[OFF:%.*]] @@ -1503,6 +1514,16 @@ define ptr @gep_ashr(ptr %p, i64 %off) { ret ptr %ptr } +define ptr @gep_lshr(ptr %p, i64 %off) { +; CHECK-LABEL: @gep_lshr( +; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[OFF:%.*]] +; CHECK-NEXT: ret ptr [[PTR]] +; + %index = lshr exact i64 %off, 2 + %ptr = getelementptr i32, ptr %p, i64 %index + ret ptr %ptr +} + ; Negative tests define 
ptr @gep_i8(ptr %p, i64 %off) { @@ -1525,6 +1546,17 @@ define ptr @gep_sdiv_mismatched_size(ptr %p, i64 %off) { ret ptr %ptr } +define ptr @gep_udiv_mismatched_size(ptr %p, i64 %off) { +; CHECK-LABEL: @gep_udiv_mismatched_size( +; CHECK-NEXT: [[INDEX:%.*]] = udiv exact i64 [[OFF:%.*]], 20 +; CHECK-NEXT: [[PTR:%.*]] = getelementptr [[STRUCT_C:%.*]], ptr [[P:%.*]], i64 [[INDEX]] +; CHECK-NEXT: ret ptr [[PTR]] +; + %index = udiv exact i64 %off, 20 + %ptr = getelementptr %struct.C, ptr %p, i64 %index + ret ptr %ptr +} + define ptr @gep_sdiv_without_exact(ptr %p, i64 %off) { ; CHECK-LABEL: @gep_sdiv_without_exact( ; CHECK-NEXT: [[INDEX:%.*]] = sdiv i64 [[OFF:%.*]], 7 @@ -1536,6 +1568,17 @@ define ptr @gep_sdiv_without_exact(ptr %p, i64 %off) { ret ptr %ptr } +define ptr @gep_udiv_without_exact(ptr %p, i64 %off) { +; CHECK-LABEL: @gep_udiv_without_exact( +; CHECK-NEXT: [[INDEX:%.*]] = udiv i64 [[OFF:%.*]], 7 +; CHECK-NEXT: [[PTR:%.*]] = getelementptr [[STRUCT_C:%.*]], ptr [[P:%.*]], i64 [[INDEX]] +; CHECK-NEXT: ret ptr [[PTR]] +; + %index = udiv i64 %off, 7 + %ptr = getelementptr %struct.C, ptr %p, i64 %index + ret ptr %ptr +} + define ptr @gep_ashr_without_exact(ptr %p, i64 %off) { ; CHECK-LABEL: @gep_ashr_without_exact( ; CHECK-NEXT: [[INDEX:%.*]] = ashr i64 [[OFF:%.*]], 2 @@ -1547,6 +1590,17 @@ define ptr @gep_ashr_without_exact(ptr %p, i64 %off) { ret ptr %ptr } +define ptr @gep_lshr_without_exact(ptr %p, i64 %off) { +; CHECK-LABEL: @gep_lshr_without_exact( +; CHECK-NEXT: [[INDEX:%.*]] = lshr i64 [[OFF:%.*]], 2 +; CHECK-NEXT: [[PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[INDEX]] +; CHECK-NEXT: ret ptr [[PTR]] +; + %index = lshr i64 %off, 2 + %ptr = getelementptr i32, ptr %p, i64 %index + ret ptr %ptr +} + define i1 @test_only_used_by_icmp(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test_only_used_by_icmp( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[B:%.*]], [[C:%.*]] diff --git a/llvm/test/Transforms/InstCombine/intrinsic-select.ll 
b/llvm/test/Transforms/InstCombine/intrinsic-select.ll index a203b28bcb82a8..f37226bbd5b09c 100644 --- a/llvm/test/Transforms/InstCombine/intrinsic-select.ll +++ b/llvm/test/Transforms/InstCombine/intrinsic-select.ll @@ -240,3 +240,43 @@ define i32 @vec_to_scalar_select_vector(<2 x i1> %b) { %c = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %s) ret i32 %c } + +define i8 @test_drop_noundef(i1 %cond, i8 %val) { +; CHECK-LABEL: @test_drop_noundef( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i8 @llvm.smin.i8(i8 [[VAL:%.*]], i8 0) +; CHECK-NEXT: [[RET:%.*]] = select i1 [[COND:%.*]], i8 -1, i8 [[TMP0]] +; CHECK-NEXT: ret i8 [[RET]] +; +entry: + %sel = select i1 %cond, i8 -1, i8 %val + %ret = call noundef i8 @llvm.smin.i8(i8 %sel, i8 0) + ret i8 %ret +} + +define i1 @pr85536(i32 %a) { +; CHECK-LABEL: @pr85536( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i32 [[A:%.*]], 31 +; CHECK-NEXT: [[SHL1:%.*]] = shl nsw i32 -1, [[A]] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SHL1]] to i64 +; CHECK-NEXT: [[SHL2:%.*]] = shl i64 [[ZEXT]], 48 +; CHECK-NEXT: [[SHR:%.*]] = ashr exact i64 [[SHL2]], 48 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.smin.i64(i64 [[SHR]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 65535 +; CHECK-NEXT: [[RET1:%.*]] = icmp eq i64 [[TMP1]], 0 +; CHECK-NEXT: [[RET:%.*]] = select i1 [[CMP1]], i1 [[RET1]], i1 false +; CHECK-NEXT: ret i1 [[RET]] +; +entry: + %cmp1 = icmp ugt i32 %a, 30 + %shl1 = shl nsw i32 -1, %a + %zext = zext i32 %shl1 to i64 + %shl2 = shl i64 %zext, 48 + %shr = ashr exact i64 %shl2, 48 + %sel = select i1 %cmp1, i64 -1, i64 %shr + %smin = call noundef i64 @llvm.smin.i64(i64 %sel, i64 0) + %masked = and i64 %smin, 65535 + %ret = icmp eq i64 %masked, 0 + ret i1 %ret +} diff --git a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll index 2704905f7a358d..c87c1199f727ea 100644 --- a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll +++ 
b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll @@ -292,7 +292,11 @@ entry: define void @scatter_nxv4i16_uniform_vals_uniform_ptrs_all_active_mask(ptr %dst, i16 %val) { ; CHECK-LABEL: @scatter_nxv4i16_uniform_vals_uniform_ptrs_all_active_mask( ; CHECK-NEXT: entry: -; CHECK-NEXT: store i16 [[VAL:%.*]], ptr [[DST:%.*]], align 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[DST:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_VALUE:%.*]] = insertelement poison, i16 [[VAL:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATVALUE:%.*]] = shufflevector [[BROADCAST_VALUE]], poison, zeroinitializer +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[BROADCAST_SPLATVALUE]], [[BROADCAST_SPLAT]], i32 2, shufflevector ( insertelement ( zeroinitializer, i1 true, i32 0), zeroinitializer, zeroinitializer)) ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/InstCombine/pr80597.ll b/llvm/test/Transforms/InstCombine/pr80597.ll new file mode 100644 index 00000000000000..5feae4a06c45c0 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/pr80597.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +define i64 @pr80597(i1 %cond) { +; CHECK-LABEL: define i64 @pr80597( +; CHECK-SAME: i1 [[COND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD:%.*]] = select i1 [[COND]], i64 0, i64 -12884901888 +; CHECK-NEXT: [[SEXT1:%.*]] = add nsw i64 [[ADD]], 8836839514384105472 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[SEXT1]], -34359738368 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[SEXT2:%.*]] = ashr exact i64 [[ADD]], 1 +; CHECK-NEXT: [[ASHR:%.*]] = or i64 [[SEXT2]], 4418419761487020032 +; CHECK-NEXT: ret i64 [[ASHR]] +; CHECK: if.then: +; CHECK-NEXT: ret i64 
0 +; +entry: + %add = select i1 %cond, i64 0, i64 4294967293 + %add8 = shl i64 %add, 32 + %sext1 = add i64 %add8, 8836839514384105472 + %cmp = icmp ult i64 %sext1, -34359738368 + br i1 %cmp, label %if.then, label %if.else + +if.else: + %sext2 = or i64 %add8, 8836839522974040064 + %ashr = ashr i64 %sext2, 1 + ret i64 %ashr + +if.then: + ret i64 0 +} diff --git a/llvm/test/Transforms/InstCombine/pr80941.ll b/llvm/test/Transforms/InstCombine/pr80941.ll new file mode 100644 index 00000000000000..95242b1d1407bf --- /dev/null +++ b/llvm/test/Transforms/InstCombine/pr80941.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +define float @pr80941(float %arg) { +; CHECK-LABEL: define float @pr80941( +; CHECK-SAME: float [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[ARG]], i32 144) +; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_EXIT:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[FPEXT:%.*]] = fpext float [[ARG]] to double +; CHECK-NEXT: [[SIGN:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[FPEXT]]) +; CHECK-NEXT: [[FPTRUNC:%.*]] = fptrunc double [[SIGN]] to float +; CHECK-NEXT: br label [[IF_EXIT]] +; CHECK: if.exit: +; CHECK-NEXT: [[RET:%.*]] = phi float [ [[FPTRUNC]], [[IF_THEN]] ], [ [[ARG]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret float [[RET]] +; +entry: + %cond = tail call i1 @llvm.is.fpclass.f32(float %arg, i32 144) + br i1 %cond, label %if.then, label %if.exit + +if.then: + %fpext = fpext float %arg to double + %sign = call double @llvm.copysign.f64(double 0.000000e+00, double %fpext) + %fptrunc = fptrunc double %sign to float + br label %if.exit + +if.exit: + %ret = phi float [ %fptrunc, %if.then ], [ %arg, %entry ] + ret float %ret +} diff --git a/llvm/test/Transforms/InstCombine/pr83931.ll b/llvm/test/Transforms/InstCombine/pr83931.ll new file mode 
100644 index 00000000000000..d36ac8d91abd30 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/pr83931.ll @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +define @dont_crash( %x) { +; CHECK-LABEL: define @dont_crash( +; CHECK-SAME: [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RET:%.*]] = icmp sgt [[X]], shufflevector ( insertelement ( poison, i64 -309383, i64 0), poison, zeroinitializer) +; CHECK-NEXT: ret [[RET]] +; +entry: + %div = sdiv %x, splat (i64 309383) + %ret = icmp sge %div, zeroinitializer + ret %ret +} diff --git a/llvm/test/Transforms/InstCombine/pr83947.ll b/llvm/test/Transforms/InstCombine/pr83947.ll new file mode 100644 index 00000000000000..c1d601ff637183 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/pr83947.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +@c = global i32 0, align 4 +@b = global i32 0, align 4 + +define void @masked_scatter1() { +; CHECK-LABEL: define void @masked_scatter1() { +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( zeroinitializer, shufflevector ( insertelement ( poison, ptr @c, i64 0), poison, zeroinitializer), i32 4, shufflevector ( insertelement ( poison, i1 icmp eq (ptr getelementptr inbounds (i32, ptr @b, i64 1), ptr @c), i64 0), poison, zeroinitializer)) +; CHECK-NEXT: ret void +; + call void @llvm.masked.scatter.nxv4i32.nxv4p0( zeroinitializer, splat (ptr @c), i32 4, splat (i1 icmp eq (ptr getelementptr (i32, ptr @b, i64 1), ptr @c))) + ret void +} + +define void @masked_scatter2() { +; CHECK-LABEL: define void @masked_scatter2() { +; CHECK-NEXT: store i32 0, ptr @c, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> splat (i1 true)) + ret void +} 
+ +define void @masked_scatter3() { +; CHECK-LABEL: define void @masked_scatter3() { +; CHECK-NEXT: store i32 0, ptr @c, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> undef) + ret void +} + +define void @masked_scatter4() { +; CHECK-LABEL: define void @masked_scatter4() { +; CHECK-NEXT: ret void +; + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> splat (i1 false)) + ret void +} + +define void @masked_scatter5() { +; CHECK-LABEL: define void @masked_scatter5() { +; CHECK-NEXT: store i32 0, ptr @c, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> ) + ret void +} + +define void @masked_scatter6() { +; CHECK-LABEL: define void @masked_scatter6() { +; CHECK-NEXT: store i32 0, ptr @c, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> ) + ret void +} + +define void @masked_scatter7() { +; CHECK-LABEL: define void @masked_scatter7() { +; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> , i32 4, <2 x i1> ) +; CHECK-NEXT: ret void +; + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> splat (i1 icmp eq (ptr getelementptr (i32, ptr @b, i64 1), ptr @c))) + ret void +} diff --git a/llvm/test/Transforms/InstCombine/select-divrem.ll b/llvm/test/Transforms/InstCombine/select-divrem.ll index f007c53359ca5a..e0c460c37451db 100644 --- a/llvm/test/Transforms/InstCombine/select-divrem.ll +++ b/llvm/test/Transforms/InstCombine/select-divrem.ll @@ -343,3 +343,20 @@ define i32 @rem_euclid_pow2_false_arm_folded(i32 %n) { %res = select i1 %nonneg, i32 %rem, i32 1 ret i32 %res } + +define i8 @pr89516(i8 %n, i8 %x) { +; CHECK-LABEL: 
@pr89516( +; CHECK-NEXT: [[COND:%.*]] = icmp slt i8 [[X:%.*]], 0 +; CHECK-NEXT: [[POW2:%.*]] = shl nuw i8 1, [[N:%.*]] +; CHECK-NEXT: [[SREM:%.*]] = srem i8 1, [[POW2]] +; CHECK-NEXT: [[ADD:%.*]] = select i1 [[COND]], i8 [[POW2]], i8 0 +; CHECK-NEXT: [[RES:%.*]] = add nuw i8 [[SREM]], [[ADD]] +; CHECK-NEXT: ret i8 [[RES]] +; + %cond = icmp slt i8 %x, 0 + %pow2 = shl nuw i8 1, %n + %srem = srem i8 1, %pow2 + %add = add nuw i8 %srem, %pow2 + %res = select i1 %cond, i8 %add, i8 %srem + ret i8 %res +} diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index c5f1b77c6d7404..888e7d28f78afb 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -2849,12 +2849,14 @@ define i8 @select_replacement_sub(i8 %x, i8 %y, i8 %z) { ret i8 %sel } +; FIXME: This is safe to fold. define i8 @select_replacement_shift_noundef(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @select_replacement_shift_noundef( ; CHECK-NEXT: [[SHR:%.*]] = lshr exact i8 [[X:%.*]], 1 ; CHECK-NEXT: call void @use_i8(i8 noundef [[SHR]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[SHR]], [[Y:%.*]] -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[Z:%.*]] +; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[Y]], 1 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[SHL]], i8 [[Z:%.*]] ; CHECK-NEXT: ret i8 [[SEL]] ; %shr = lshr exact i8 %x, 1 @@ -2904,6 +2906,40 @@ define i32 @select_replacement_loop2(i32 %arg, i32 %arg2) { ret i32 %sel } +define i8 @select_replacement_loop3(i32 noundef %x) { +; CHECK-LABEL: @select_replacement_loop3( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[X:%.*]] to i8 +; CHECK-NEXT: [[REV:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[TRUNC]]) +; CHECK-NEXT: [[EXT:%.*]] = zext i8 [[REV]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[EXT]], [[X]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[TRUNC]], i8 0 +; CHECK-NEXT: ret i8 [[SEL]] +; + %trunc = trunc i32 %x to i8 + %rev = call i8 
@llvm.bitreverse.i8(i8 %trunc) + %ext = zext i8 %rev to i32 + %cmp = icmp eq i32 %ext, %x + %sel = select i1 %cmp, i8 %trunc, i8 0 + ret i8 %sel +} + +define i16 @select_replacement_loop4(i16 noundef %p_12) { +; CHECK-LABEL: @select_replacement_loop4( +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i16 [[P_12:%.*]], 2 +; CHECK-NEXT: [[AND1:%.*]] = and i16 [[P_12]], 1 +; CHECK-NEXT: [[AND2:%.*]] = select i1 [[CMP1]], i16 [[AND1]], i16 0 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i16 [[AND2]], [[P_12]] +; CHECK-NEXT: [[AND3:%.*]] = select i1 [[CMP2]], i16 [[AND1]], i16 0 +; CHECK-NEXT: ret i16 [[AND3]] +; + %cmp1 = icmp ult i16 %p_12, 2 + %and1 = and i16 %p_12, 1 + %and2 = select i1 %cmp1, i16 %and1, i16 0 + %cmp2 = icmp eq i16 %and2, %p_12 + %and3 = select i1 %cmp2, i16 %and1, i16 0 + ret i16 %and3 +} + define ptr @select_replacement_gep_inbounds(ptr %base, i64 %offset) { ; CHECK-LABEL: @select_replacement_gep_inbounds( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]] @@ -3423,7 +3459,7 @@ define @scalable_sign_bits( %x) { define @scalable_non_zero( %x) { ; CHECK-LABEL: @scalable_non_zero( ; CHECK-NEXT: [[A:%.*]] = or [[X:%.*]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[CMP:%.*]] = icmp ule [[A]], shufflevector ( insertelement ( poison, i32 56, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[CMP:%.*]] = icmp ult [[A]], shufflevector ( insertelement ( poison, i32 57, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[CMP]] ; %a = or %x, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) diff --git a/llvm/test/Transforms/InstCombine/vscale_cmp.ll b/llvm/test/Transforms/InstCombine/vscale_cmp.ll index a7f8368c5d62c8..b2bfc93da089fc 100644 --- a/llvm/test/Transforms/InstCombine/vscale_cmp.ll +++ b/llvm/test/Transforms/InstCombine/vscale_cmp.ll @@ -3,7 +3,7 @@ define @sge( %x) { ; CHECK-LABEL: @sge( -; CHECK-NEXT: [[CMP:%.*]] = icmp sge [[X:%.*]], 
zeroinitializer +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt [[X:%.*]], shufflevector ( insertelement ( poison, i8 -1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[CMP]] ; %cmp = icmp sge %x, zeroinitializer diff --git a/llvm/test/Transforms/InstSimplify/pr87042.ll b/llvm/test/Transforms/InstSimplify/pr87042.ll new file mode 100644 index 00000000000000..800d27c9e65043 --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/pr87042.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=instsimplify -S | FileCheck %s + +; %or2 cannot be folded into %or1 because %or1 has disjoint. +; TODO: Can we move the logic into InstCombine and drop the disjoint flag? +define i64 @test(i1 %cond, i64 %x) { +; CHECK-LABEL: define i64 @test( +; CHECK-SAME: i1 [[COND:%.*]], i64 [[X:%.*]]) { +; CHECK-NEXT: [[OR1:%.*]] = or disjoint i64 [[X]], 7 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[COND]], i64 [[OR1]], i64 [[X]] +; CHECK-NEXT: [[OR2:%.*]] = or i64 [[SEL1]], 7 +; CHECK-NEXT: ret i64 [[OR2]] +; + %or1 = or disjoint i64 %x, 7 + %sel1 = select i1 %cond, i64 %or1, i64 %x + %or2 = or i64 %sel1, 7 + ret i64 %or2 +} + +define i64 @pr87042(i64 %x) { +; CHECK-LABEL: define i64 @pr87042( +; CHECK-SAME: i64 [[X:%.*]]) { +; CHECK-NEXT: [[AND1:%.*]] = and i64 [[X]], 65535 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 [[AND1]], 0 +; CHECK-NEXT: [[OR1:%.*]] = or disjoint i64 [[X]], 7 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP1]], i64 [[OR1]], i64 [[X]] +; CHECK-NEXT: [[AND2:%.*]] = and i64 [[SEL1]], 16776960 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 [[AND2]], 0 +; CHECK-NEXT: [[OR2:%.*]] = or i64 [[SEL1]], 7 +; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[CMP2]], i64 [[OR2]], i64 [[SEL1]] +; CHECK-NEXT: ret i64 [[SEL2]] +; + %and1 = and i64 %x, 65535 + %cmp1 = icmp eq i64 %and1, 0 + %or1 = or disjoint i64 %x, 7 + %sel1 = select i1 %cmp1, i64 %or1, i64 %x + %and2 = and i64 %sel1, 16776960 + %cmp2 = icmp eq i64 %and2, 0 + 
%or2 = or i64 %sel1, 7 + %sel2 = select i1 %cmp2, i64 %or2, i64 %sel1 + ret i64 %sel2 +} diff --git a/llvm/test/Transforms/JumpThreading/pr79175.ll b/llvm/test/Transforms/JumpThreading/pr79175.ll new file mode 100644 index 00000000000000..2c7ee0770cdc73 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/pr79175.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=jump-threading < %s | FileCheck %s + +@f = external global i32 + +; Make sure the value of @f is reloaded prior to the final comparison. +define i32 @test(i64 %idx, i32 %val) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: i64 [[IDX:%.*]], i32 [[VAL:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[IDX]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[RETURN:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[F:%.*]] = load i32, ptr @f, align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[F]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[COND_END_THREAD:%.*]], label [[COND_END:%.*]] +; CHECK: cond.end: +; CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[VAL]], 0 +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[CMP_I]] +; CHECK-NEXT: br i1 [[COND_FR]], label [[COND_END_THREAD]], label [[TMP0:%.*]] +; CHECK: cond.end.thread: +; CHECK-NEXT: br label [[TMP0]] +; CHECK: 0: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[COND_END_THREAD]] ], [ [[VAL]], [[COND_END]] ] +; CHECK-NEXT: [[F_IDX:%.*]] = getelementptr inbounds i32, ptr @f, i64 [[IDX]] +; CHECK-NEXT: store i32 [[TMP1]], ptr [[F_IDX]], align 4 +; CHECK-NEXT: [[F_RELOAD:%.*]] = load i32, ptr @f, align 4 +; CHECK-NEXT: [[CMP3:%.*]] = icmp slt i32 [[F_RELOAD]], 1 +; CHECK-NEXT: br i1 [[CMP3]], label [[RETURN2:%.*]], label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: ret i32 0 +; CHECK: return2: +; CHECK-NEXT: ret i32 1 +; +entry: + %cmp = icmp slt i64 %idx, 1 + br i1 %cmp, label %for.body, label %return + +for.body: + %f = load i32, ptr @f, align 4 + 
%cmp1 = icmp eq i32 %f, 0 + br i1 %cmp1, label %cond.end, label %cond.false + +cond.false: + br label %cond.end + +cond.end: + %phi = phi i32 [ %val, %cond.false ], [ 1, %for.body ] + %cmp.i = icmp sgt i32 %phi, 0 + %sel = select i1 %cmp.i, i32 0, i32 %phi + %f.idx = getelementptr inbounds i32, ptr @f, i64 %idx + store i32 %sel, ptr %f.idx, align 4 + %f.reload = load i32, ptr @f, align 4 + %cmp3 = icmp slt i32 %f.reload, 1 + br i1 %cmp3, label %return2, label %return + +return: + ret i32 0 + +return2: + ret i32 1 +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll new file mode 100644 index 00000000000000..8a796bb3065b19 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll @@ -0,0 +1,217 @@ +; REQUIRES: asserts +; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define void @no_outer_loop(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %off, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'no_outer_loop' +; CHECK: Calculating cost of runtime checks: +; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop. 
+; CHECK: Total cost of runtime checks: 4 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %entry ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %off + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + ret void +} + +define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. 
Cost reduced from 6 to 3 +; CHECK: Total cost of runtime checks: 3 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond27.not = icmp eq i64 %outer.iv.next, %m + br i1 %exitcond27.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. 
Cost reduced from 6 to 2 +; CHECK: Total cost of runtime checks: 2 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond26.not = icmp eq i64 %outer.iv.next, 3 + br i1 %exitcond26.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. 
Cost reduced from 6 to 1 +; CHECK: Total cost of runtime checks: 1 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond26.not = icmp eq i64 %outer.iv.next, 64 + br i1 %exitcond26.not, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. 
Cost reduced from 6 to 2 +; CHECK: Total cost of runtime checks: 2 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] + %mul.us = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] + %add.us = add nuw nsw i64 %inner.iv, %mul.us + %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us + %0 = load i8, ptr %arrayidx.us, align 1 + %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us + %1 = load i8, ptr %arrayidx7.us, align 1 + %add9.us = add i8 %1, %0 + store i8 %add9.us, ptr %arrayidx7.us, align 1 + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %exitcond.not = icmp eq i64 %inner.iv.next, %n + br i1 %exitcond.not, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %exitcond26.not = icmp eq i64 %outer.iv.next, %m + br i1 %exitcond26.not, label %outer.exit, label %outer.loop, !prof !0 + +outer.exit: + ret void +} + + +define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks' +; CHECK: Calculating cost of runtime checks: +; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. 
Cost reduced from 6 to 2 +; CHECK: Total cost of runtime checks: 2 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ] + %0 = mul nsw i64 %outer.iv, %n + br label %inner.loop + +inner.loop: + %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ] + %1 = add nuw nsw i64 %iv.inner, %0 + %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1 + %2 = load i32, ptr %arrayidx.us, align 4 + %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1 + %3 = load i32, ptr %arrayidx8.us, align 4 + %add9.us = add nsw i32 %3, %2 + store i32 %add9.us, ptr %arrayidx8.us, align 4 + %iv.inner.next = add nuw nsw i64 %iv.inner, 1 + %inner.exit.cond = icmp eq i64 %iv.inner.next, %n + br i1 %inner.exit.cond, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %outer.exit.cond = icmp eq i64 %outer.iv.next, 3 + br i1 %outer.exit.cond, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + + +!0 = !{!"branch_weights", i32 10, i32 20} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll index d07f72792e6b99..dd1495626eb984 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(cos|sin|tan|cbrt|erf|exp[^e]|gamma|log|sqrt|copysign|dim|min|mod|hypot|nextafter|pow|fma)" --version 2 -; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=SLEEF-NEON -; RUN: opt -mattr=+sve -vector-library=sleefgnuabi 
-passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=SLEEF-SVE -; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=ARMPL-NEON -; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=ARMPL-SVE +; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefix=SLEEF-NEON +; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s -check-prefix=SLEEF-SVE +; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefixes=SLEEF-SVE-NOPRED +; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefix=ARMPL-NEON +; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s -check-prefix=ARMPL-SVE +; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefixes=ARMPL-SVE-NOPRED + + target triple = "aarch64-unknown-linux-gnu" @@ -23,6 +27,11 @@ define void @acos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias 
[[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_acos( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @acos_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_acos( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @acos(double [[IN:%.*]]) #[[ATTR2:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @acos_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vacosq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -30,6 +39,11 @@ define void @acos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @acos_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svacos_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @acos_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svacos_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @acos(double [[IN:%.*]]) #[[ATTR2:[0-9]+]] ; entry: br label %for.body @@ -58,6 +72,11 @@ define void @acos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_acosf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @acos_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; 
SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_acosf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @acosf(float [[IN:%.*]]) #[[ATTR3:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @acos_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vacosq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -65,6 +84,11 @@ define void @acos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @acos_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svacos_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @acos_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svacos_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @acosf(float [[IN:%.*]]) #[[ATTR3:[0-9]+]] ; entry: br label %for.body @@ -96,6 +120,11 @@ define void @acosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_acosh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @acosh_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_acosh( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @acosh(double [[IN:%.*]]) #[[ATTR4:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @acosh_f64 ; ARMPL-NEON-SAME: (ptr 
noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vacoshq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -103,6 +132,11 @@ define void @acosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @acosh_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svacosh_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @acosh_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svacosh_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @acosh(double [[IN:%.*]]) #[[ATTR4:[0-9]+]] ; entry: br label %for.body @@ -131,6 +165,11 @@ define void @acosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_acoshf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @acosh_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_acoshf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @acoshf(float [[IN:%.*]]) #[[ATTR5:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @acosh_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vacoshq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -138,6 +177,11 @@ define void @acosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @acosh_f32 ; 
ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svacosh_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @acosh_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svacosh_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @acoshf(float [[IN:%.*]]) #[[ATTR5:[0-9]+]] ; entry: br label %for.body @@ -169,6 +213,11 @@ define void @asin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_asin( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @asin_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_asin( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @asin(double [[IN:%.*]]) #[[ATTR6:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @asin_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vasinq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -176,6 +225,11 @@ define void @asin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @asin_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svasin_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @asin_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias 
[[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svasin_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @asin(double [[IN:%.*]]) #[[ATTR6:[0-9]+]] ; entry: br label %for.body @@ -204,6 +258,11 @@ define void @asin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_asinf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @asin_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_asinf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @asinf(float [[IN:%.*]]) #[[ATTR7:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @asin_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vasinq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -211,6 +270,11 @@ define void @asin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @asin_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svasin_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @asin_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svasin_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @asinf(float [[IN:%.*]]) #[[ATTR7:[0-9]+]] ; entry: br label %for.body @@ 
-242,6 +306,11 @@ define void @asinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_asinh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @asinh_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_asinh( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @asinh(double [[IN:%.*]]) #[[ATTR8:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @asinh_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vasinhq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -249,6 +318,11 @@ define void @asinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @asinh_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svasinh_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @asinh_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svasinh_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @asinh(double [[IN:%.*]]) #[[ATTR8:[0-9]+]] ; entry: br label %for.body @@ -277,6 +351,11 @@ define void @asinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_asinhf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: 
define void @asinh_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_asinhf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @asinhf(float [[IN:%.*]]) #[[ATTR9:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @asinh_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vasinhq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -284,6 +363,11 @@ define void @asinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @asinh_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svasinh_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @asinh_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svasinh_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @asinhf(float [[IN:%.*]]) #[[ATTR9:[0-9]+]] ; entry: br label %for.body @@ -315,6 +399,11 @@ define void @atan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_atan( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @atan_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_atan( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = 
tail call double @atan(double [[IN:%.*]]) #[[ATTR10:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @atan_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vatanq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -322,6 +411,11 @@ define void @atan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @atan_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svatan_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @atan_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svatan_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @atan(double [[IN:%.*]]) #[[ATTR10:[0-9]+]] ; entry: br label %for.body @@ -350,6 +444,11 @@ define void @atan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_atanf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @atan_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_atanf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @atanf(float [[IN:%.*]]) #[[ATTR11:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @atan_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vatanq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -357,6 +456,11 @@ define 
void @atan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @atan_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svatan_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @atan_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svatan_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @atanf(float [[IN:%.*]]) #[[ATTR11:[0-9]+]] ; entry: br label %for.body @@ -388,6 +492,11 @@ define void @atan2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_atan2( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @atan2_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_atan2( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @atan2(double [[IN:%.*]], double [[IN]]) #[[ATTR12:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @atan2_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vatan2q_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -395,6 +504,11 @@ define void @atan2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @atan2_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call 
@armpl_svatan2_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @atan2_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svatan2_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @atan2(double [[IN:%.*]], double [[IN]]) #[[ATTR12:[0-9]+]] ; entry: br label %for.body @@ -423,6 +537,11 @@ define void @atan2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_atan2f( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @atan2_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_atan2f( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @atan2f(float [[IN:%.*]], float [[IN]]) #[[ATTR13:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @atan2_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vatan2q_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -430,6 +549,11 @@ define void @atan2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @atan2_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svatan2_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @atan2_f32 +; ARMPL-SVE-NOPRED-SAME: 
(ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svatan2_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @atan2f(float [[IN:%.*]], float [[IN]]) #[[ATTR13:[0-9]+]] ; entry: br label %for.body @@ -461,6 +585,11 @@ define void @atanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_atanh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @atanh_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_atanh( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @atanh(double [[IN:%.*]]) #[[ATTR14:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @atanh_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vatanhq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -468,6 +597,11 @@ define void @atanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @atanh_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svatanh_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @atanh_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svatanh_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail 
call double @atanh(double [[IN:%.*]]) #[[ATTR14:[0-9]+]] ; entry: br label %for.body @@ -496,6 +630,11 @@ define void @atanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_atanhf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @atanh_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_atanhf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @atanhf(float [[IN:%.*]]) #[[ATTR15:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @atanh_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vatanhq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -503,6 +642,11 @@ define void @atanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @atanh_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svatanh_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @atanh_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svatanh_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @atanhf(float [[IN:%.*]]) #[[ATTR15:[0-9]+]] ; entry: br label %for.body @@ -534,6 +678,11 @@ define void @cbrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call 
@_ZGVsMxv_cbrt( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cbrt_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_cbrt( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @cbrt(double [[IN:%.*]]) #[[ATTR16:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @cbrt_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcbrtq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -541,6 +690,11 @@ define void @cbrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cbrt_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcbrt_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cbrt_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcbrt_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @cbrt(double [[IN:%.*]]) #[[ATTR16:[0-9]+]] ; entry: br label %for.body @@ -569,6 +723,11 @@ define void @cbrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_cbrtf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cbrt_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_cbrtf( [[WIDE_LOAD:%.*]], shufflevector ( 
insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @cbrtf(float [[IN:%.*]]) #[[ATTR17:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @cbrt_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcbrtq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -576,6 +735,11 @@ define void @cbrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cbrt_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcbrt_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cbrt_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcbrt_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @cbrtf(float [[IN:%.*]]) #[[ATTR17:[0-9]+]] ; entry: br label %for.body @@ -607,6 +771,11 @@ define void @copysign_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_copysign( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @copysign_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_copysign( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]]) #[[ATTR18:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @copysign_f64 ; ARMPL-NEON-SAME: (ptr noalias 
[[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcopysignq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -614,6 +783,11 @@ define void @copysign_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @copysign_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcopysign_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @copysign_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcopysign_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]]) #[[ATTR18:[0-9]+]] ; entry: br label %for.body @@ -642,6 +816,11 @@ define void @copysign_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_copysignf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @copysign_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_copysignf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]]) #[[ATTR19:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @copysign_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcopysignq_f32(<4 x 
float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -649,6 +828,11 @@ define void @copysign_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @copysign_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcopysign_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @copysign_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcopysign_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]]) #[[ATTR19:[0-9]+]] ; entry: br label %for.body @@ -680,6 +864,11 @@ define void @cos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_cos( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cos_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_cos( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @cos(double [[IN:%.*]]) #[[ATTR20:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @cos_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcosq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -687,6 +876,11 @@ define void @cos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cos_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias 
[[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcos_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cos_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcos_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @cos(double [[IN:%.*]]) #[[ATTR20:[0-9]+]] ; entry: br label %for.body @@ -715,6 +909,11 @@ define void @cos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_cosf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cos_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_cosf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @cosf(float [[IN:%.*]]) #[[ATTR21:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @cos_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcosq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -722,6 +921,11 @@ define void @cos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cos_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcos_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cos_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call 
@armpl_svcos_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @cosf(float [[IN:%.*]]) #[[ATTR21:[0-9]+]] ; entry: br label %for.body @@ -753,6 +957,11 @@ define void @cosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_cosh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cosh_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_cosh( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @cosh(double [[IN:%.*]]) #[[ATTR22:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @cosh_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcoshq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -760,6 +969,11 @@ define void @cosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cosh_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcosh_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cosh_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcosh_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @cosh(double [[IN:%.*]]) #[[ATTR22:[0-9]+]] ; entry: br label %for.body @@ -788,6 +1002,11 @@ define void @cosh_f32(ptr noalias %in.ptr, ptr noalias 
%out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_coshf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cosh_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_coshf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @coshf(float [[IN:%.*]]) #[[ATTR23:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @cosh_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcoshq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -795,6 +1014,11 @@ define void @cosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cosh_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcosh_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cosh_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcosh_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @coshf(float [[IN:%.*]]) #[[ATTR23:[0-9]+]] ; entry: br label %for.body @@ -826,6 +1050,11 @@ define void @cospi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_cospi( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cospi_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr 
noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_cospi( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @cospi(double [[IN:%.*]]) #[[ATTR24:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @cospi_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcospiq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -833,6 +1062,11 @@ define void @cospi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cospi_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcospi_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cospi_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcospi_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @cospi(double [[IN:%.*]]) #[[ATTR24:[0-9]+]] ; entry: br label %for.body @@ -861,6 +1095,11 @@ define void @cospi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_cospif( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @cospi_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_cospif( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]]) #[[ATTR25:[0-9]+]] +; ; 
ARMPL-NEON-LABEL: define void @cospi_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcospiq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -868,6 +1107,11 @@ define void @cospi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @cospi_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcospi_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @cospi_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svcospi_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]]) #[[ATTR25:[0-9]+]] ; entry: br label %for.body @@ -899,6 +1143,11 @@ define void @erf_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_erf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @erf_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_erf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @erf(double [[IN:%.*]]) #[[ATTR26:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @erf_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_verfq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -906,6 +1155,11 @@ define void @erf_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; 
ARMPL-SVE-LABEL: define void @erf_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_sverf_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @erf_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_sverf_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @erf(double [[IN:%.*]]) #[[ATTR26:[0-9]+]] ; entry: br label %for.body @@ -934,6 +1188,11 @@ define void @erf_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_erff( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @erf_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_erff( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @erff(float [[IN:%.*]]) #[[ATTR27:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @erf_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_verfq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -941,6 +1200,11 @@ define void @erf_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @erf_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_sverf_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @erf_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias 
[[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_sverf_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @erff(float [[IN:%.*]]) #[[ATTR27:[0-9]+]] ; entry: br label %for.body @@ -972,6 +1236,11 @@ define void @erfc_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_erfc( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @erfc_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_erfc( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @erfc(double [[IN:%.*]]) #[[ATTR28:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @erfc_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_verfcq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -979,6 +1248,11 @@ define void @erfc_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @erfc_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_sverfc_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @erfc_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_sverfc_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @erfc(double [[IN:%.*]]) #[[ATTR28:[0-9]+]] ; 
entry: br label %for.body @@ -1007,6 +1281,11 @@ define void @erfc_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_erfcf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @erfc_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_erfcf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @erfcf(float [[IN:%.*]]) #[[ATTR29:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @erfc_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_verfcq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1014,6 +1293,11 @@ define void @erfc_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @erfc_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_sverfc_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @erfc_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_sverfc_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @erfcf(float [[IN:%.*]]) #[[ATTR29:[0-9]+]] ; entry: br label %for.body @@ -1045,6 +1329,11 @@ define void @exp_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_exp( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; 
SLEEF-SVE-NOPRED-LABEL: define void @exp_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_exp( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @exp(double [[IN:%.*]]) #[[ATTR30:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @exp_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexpq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1052,6 +1341,11 @@ define void @exp_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @exp_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexp_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @exp_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexp_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @exp(double [[IN:%.*]]) #[[ATTR30:[0-9]+]] ; entry: br label %for.body @@ -1080,6 +1374,11 @@ define void @exp_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_expf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @exp_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_expf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: 
[[CALL:%.*]] = tail call float @expf(float [[IN:%.*]]) #[[ATTR31:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @exp_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexpq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1087,6 +1386,11 @@ define void @exp_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @exp_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexp_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @exp_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexp_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @expf(float [[IN:%.*]]) #[[ATTR31:[0-9]+]] ; entry: br label %for.body @@ -1118,6 +1422,11 @@ define void @exp10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_exp10( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @exp10_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_exp10( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @exp10(double [[IN:%.*]]) #[[ATTR32:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @exp10_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp10q_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1125,6 
+1434,11 @@ define void @exp10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @exp10_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexp10_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @exp10_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexp10_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @exp10(double [[IN:%.*]]) #[[ATTR32:[0-9]+]] ; entry: br label %for.body @@ -1153,6 +1467,11 @@ define void @exp10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_exp10f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @exp10_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_exp10f( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @exp10f(float [[IN:%.*]]) #[[ATTR33:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @exp10_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp10q_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1160,6 +1479,11 @@ define void @exp10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @exp10_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexp10_f32_x( [[WIDE_MASKED_LOAD:%.*]], 
[[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @exp10_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexp10_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @exp10f(float [[IN:%.*]]) #[[ATTR33:[0-9]+]] ; entry: br label %for.body @@ -1191,6 +1515,11 @@ define void @exp2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_exp2( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @exp2_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_exp2( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @exp2(double [[IN:%.*]]) #[[ATTR34:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @exp2_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp2q_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1198,6 +1527,11 @@ define void @exp2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @exp2_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexp2_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @exp2_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexp2_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), 
poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @exp2(double [[IN:%.*]]) #[[ATTR34:[0-9]+]] ; entry: br label %for.body @@ -1226,6 +1560,11 @@ define void @exp2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_exp2f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @exp2_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_exp2f( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @exp2f(float [[IN:%.*]]) #[[ATTR35:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @exp2_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp2q_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1233,6 +1572,11 @@ define void @exp2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @exp2_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexp2_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @exp2_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexp2_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @exp2f(float [[IN:%.*]]) #[[ATTR35:[0-9]+]] ; entry: br label %for.body @@ -1264,6 +1608,11 @@ define void @expm1_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) 
#[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_expm1( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @expm1_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_expm1( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @expm1(double [[IN:%.*]]) #[[ATTR36:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @expm1_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexpm1q_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1271,6 +1620,11 @@ define void @expm1_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @expm1_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexpm1_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @expm1_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexpm1_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @expm1(double [[IN:%.*]]) #[[ATTR36:[0-9]+]] ; entry: br label %for.body @@ -1299,6 +1653,11 @@ define void @expm1_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_expm1f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @expm1_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = 
call @_ZGVsMxv_expm1f( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @expm1f(float [[IN:%.*]]) #[[ATTR37:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @expm1_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexpm1q_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1306,6 +1665,11 @@ define void @expm1_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @expm1_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svexpm1_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @expm1_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svexpm1_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @expm1f(float [[IN:%.*]]) #[[ATTR37:[0-9]+]] ; entry: br label %for.body @@ -1337,6 +1701,11 @@ define void @fdim_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fdim( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fdim_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fdim( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]]) #[[ATTR38:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void 
@fdim_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfdimq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1344,6 +1713,11 @@ define void @fdim_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fdim_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfdim_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fdim_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfdim_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]]) #[[ATTR38:[0-9]+]] ; entry: br label %for.body @@ -1372,6 +1746,11 @@ define void @fdim_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fdimf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fdim_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fdimf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]]) #[[ATTR39:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fdim_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfdimq_f32(<4 x float> 
[[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -1379,6 +1758,11 @@ define void @fdim_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fdim_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfdim_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fdim_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfdim_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]]) #[[ATTR39:[0-9]+]] ; entry: br label %for.body @@ -1410,6 +1794,11 @@ define void @fma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvvv_fma( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fma_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvvv_fma( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]]) #[[ATTR40:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fma_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfmaq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]]) @@ -1417,6 +1806,11 @@ define void @fma_f64(ptr noalias %in.ptr, 
ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fma_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfma_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fma_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfma_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]]) #[[ATTR40:[0-9]+]] ; entry: br label %for.body @@ -1445,6 +1839,11 @@ define void @fma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvvv_fmaf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fma_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvvv_fmaf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]]) #[[ATTR41:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fma_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfmaq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]]) @@ -1452,6 +1851,11 @@ define void @fma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fma_f32 ; 
ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfma_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fma_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfma_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]]) #[[ATTR41:[0-9]+]] ; entry: br label %for.body @@ -1483,6 +1887,11 @@ define void @fmax_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fmax( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fmax_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fmax( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]]) #[[ATTR42:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fmax_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfmaxq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1490,6 +1899,11 @@ define void @fmax_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fmax_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call 
@armpl_svfmax_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fmax_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfmax_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]]) #[[ATTR42:[0-9]+]] ; entry: br label %for.body @@ -1518,6 +1932,11 @@ define void @fmax_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fmaxf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fmax_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fmaxf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]]) #[[ATTR43:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fmax_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfmaxq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -1525,6 +1944,11 @@ define void @fmax_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fmax_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfmax_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fmax_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias 
[[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfmax_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]]) #[[ATTR43:[0-9]+]] ; entry: br label %for.body @@ -1556,6 +1980,11 @@ define void @fmin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fmin( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fmin_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fmin( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]]) #[[ATTR44:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fmin_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfminq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1563,6 +1992,11 @@ define void @fmin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fmin_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfmin_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fmin_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfmin_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( 
poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]]) #[[ATTR44:[0-9]+]] ; entry: br label %for.body @@ -1591,6 +2025,11 @@ define void @fmin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fminf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fmin_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fminf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]]) #[[ATTR45:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fmin_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfminq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -1598,6 +2037,11 @@ define void @fmin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fmin_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfmin_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fmin_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfmin_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]]) #[[ATTR45:[0-9]+]] ; entry: br label 
%for.body @@ -1629,6 +2073,11 @@ define void @fmod_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fmod( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fmod_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fmod( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @fmod(double [[IN:%.*]], double [[IN]]) #[[ATTR46:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fmod_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfmodq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1636,6 +2085,11 @@ define void @fmod_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fmod_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfmod_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fmod_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfmod_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @fmod(double [[IN:%.*]], double [[IN]]) #[[ATTR46:[0-9]+]] ; entry: br label %for.body @@ -1664,6 +2118,11 @@ define void @fmod_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) 
#[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_fmodf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @fmod_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_fmodf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @fmodf(float [[IN:%.*]], float [[IN]]) #[[ATTR47:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @fmod_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfmodq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -1671,6 +2130,11 @@ define void @fmod_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @fmod_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfmod_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @fmod_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svfmod_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @fmodf(float [[IN:%.*]], float [[IN]]) #[[ATTR47:[0-9]+]] ; entry: br label %for.body @@ -1702,6 +2166,11 @@ define void @hypot_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_hypot( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void 
@hypot_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_hypot( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @hypot(double [[IN:%.*]], double [[IN]]) #[[ATTR48:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @hypot_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vhypotq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1709,6 +2178,11 @@ define void @hypot_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @hypot_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svhypot_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @hypot_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svhypot_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @hypot(double [[IN:%.*]], double [[IN]]) #[[ATTR48:[0-9]+]] ; entry: br label %for.body @@ -1737,6 +2211,11 @@ define void @hypot_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_hypotf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @hypot_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_hypotf( 
[[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @hypotf(float [[IN:%.*]], float [[IN]]) #[[ATTR49:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @hypot_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vhypotq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -1744,6 +2223,11 @@ define void @hypot_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @hypot_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svhypot_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @hypot_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svhypot_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @hypotf(float [[IN:%.*]], float [[IN]]) #[[ATTR49:[0-9]+]] ; entry: br label %for.body @@ -1775,6 +2259,11 @@ define void @ilogb_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_ilogb( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @ilogb_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_ilogb( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call i32 @ilogb(double [[IN:%.*]]) #[[ATTR50:[0-9]+]] +; ; 
ARMPL-NEON-LABEL: define void @ilogb_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x i32> @armpl_vilogbq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1782,6 +2271,11 @@ define void @ilogb_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @ilogb_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svilogb_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @ilogb_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svilogb_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call i32 @ilogb(double [[IN:%.*]]) #[[ATTR50:[0-9]+]] ; entry: br label %for.body @@ -1810,6 +2304,11 @@ define void @ilogb_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_ilogbf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @ilogb_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_ilogbf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]]) #[[ATTR51:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @ilogb_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x i32> @armpl_vilogbq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1817,6 +2316,11 @@ define void @ilogb_f32(ptr noalias %in.ptr, ptr noalias 
%out.ptr) { ; ARMPL-SVE-LABEL: define void @ilogb_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svilogb_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @ilogb_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svilogb_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]]) #[[ATTR51:[0-9]+]] ; entry: br label %for.body @@ -1848,6 +2352,11 @@ define void @ldexp_f64(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias % ; SLEEF-SVE-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP17:%.*]] = call @_ZGVsMxvv_ldexp( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD1:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @ldexp_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP11:%.*]] = call @_ZGVsMxvv_ldexp( [[WIDE_LOAD:%.*]], [[WIDE_LOAD1:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]]) #[[ATTR52:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @ldexp_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vldexpq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x i32> [[WIDE_LOAD1:%.*]]) @@ -1855,6 +2364,11 @@ define void @ldexp_f64(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias % ; ARMPL-SVE-LABEL: define void @ldexp_f64 ; ARMPL-SVE-SAME: (ptr noalias 
[[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP17:%.*]] = call @armpl_svldexp_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD1:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @ldexp_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP11:%.*]] = call @armpl_svldexp_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD1:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]]) #[[ATTR52:[0-9]+]] ; entry: br label %for.body @@ -1885,6 +2399,11 @@ define void @ldexp_f32(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias % ; SLEEF-SVE-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP17:%.*]] = call @_ZGVsMxvv_ldexpf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD1:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @ldexp_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP11:%.*]] = call @_ZGVsMxvv_ldexpf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD1:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]]) #[[ATTR53:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @ldexp_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vldexpq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x i32> [[WIDE_LOAD1:%.*]]) @@ -1892,6 +2411,11 @@ define void @ldexp_f32(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias % ; ARMPL-SVE-LABEL: define void 
@ldexp_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP17:%.*]] = call @armpl_svldexp_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD1:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @ldexp_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP11:%.*]] = call @armpl_svldexp_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD1:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]]) #[[ATTR53:[0-9]+]] ; entry: br label %for.body @@ -1925,6 +2449,11 @@ define void @lgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_lgamma( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @lgamma_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_lgamma( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @lgamma(double [[IN:%.*]]) #[[ATTR54:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @lgamma_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlgammaq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1932,6 +2461,11 @@ define void @lgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @lgamma_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlgamma_f64_x( 
[[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @lgamma_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlgamma_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @lgamma(double [[IN:%.*]]) #[[ATTR54:[0-9]+]] ; entry: br label %for.body @@ -1960,6 +2494,11 @@ define void @lgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_lgammaf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @lgamma_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_lgammaf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @lgammaf(float [[IN:%.*]]) #[[ATTR55:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @lgamma_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlgammaq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1967,6 +2506,11 @@ define void @lgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @lgamma_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlgamma_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @lgamma_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlgamma_f32_x( [[WIDE_LOAD:%.*]], 
shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @lgammaf(float [[IN:%.*]]) #[[ATTR55:[0-9]+]] ; entry: br label %for.body @@ -1998,6 +2542,11 @@ define void @log_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_log( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_log( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @log(double [[IN:%.*]]) #[[ATTR56:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlogq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2005,6 +2554,11 @@ define void @log_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @log(double [[IN:%.*]]) #[[ATTR56:[0-9]+]] ; entry: br label %for.body @@ -2033,6 +2587,11 @@ define void @log_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias 
[[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_logf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_logf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @logf(float [[IN:%.*]]) #[[ATTR57:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlogq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2040,6 +2599,11 @@ define void @log_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @logf(float [[IN:%.*]]) #[[ATTR57:[0-9]+]] ; entry: br label %for.body @@ -2071,6 +2635,11 @@ define void @log10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_log10( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log10_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: 
[[TMP9:%.*]] = call @_ZGVsMxv_log10( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @log10(double [[IN:%.*]]) #[[ATTR58:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log10_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog10q_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2078,6 +2647,11 @@ define void @log10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log10_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog10_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log10_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog10_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @log10(double [[IN:%.*]]) #[[ATTR58:[0-9]+]] ; entry: br label %for.body @@ -2106,6 +2680,11 @@ define void @log10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_log10f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log10_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_log10f( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @log10f(float [[IN:%.*]]) #[[ATTR59:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log10_f32 ; ARMPL-NEON-SAME: 
(ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog10q_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2113,6 +2692,11 @@ define void @log10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log10_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog10_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log10_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog10_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @log10f(float [[IN:%.*]]) #[[ATTR59:[0-9]+]] ; entry: br label %for.body @@ -2144,6 +2728,11 @@ define void @log1p_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_log1p( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log1p_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_log1p( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @log1p(double [[IN:%.*]]) #[[ATTR60:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log1p_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog1pq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2151,6 +2740,11 @@ define void @log1p_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log1p_f64 ; 
ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog1p_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log1p_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog1p_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @log1p(double [[IN:%.*]]) #[[ATTR60:[0-9]+]] ; entry: br label %for.body @@ -2179,6 +2773,11 @@ define void @log1p_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_log1pf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log1p_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_log1pf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @log1pf(float [[IN:%.*]]) #[[ATTR61:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log1p_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog1pq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2186,6 +2785,11 @@ define void @log1p_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log1p_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog1p_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log1p_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], 
ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog1p_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @log1pf(float [[IN:%.*]]) #[[ATTR61:[0-9]+]] ; entry: br label %for.body @@ -2217,6 +2821,11 @@ define void @log2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_log2( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log2_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_log2( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @log2(double [[IN:%.*]]) #[[ATTR62:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log2_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog2q_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2224,6 +2833,11 @@ define void @log2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log2_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog2_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log2_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog2_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @log2(double [[IN:%.*]]) #[[ATTR62:[0-9]+]] ; entry: br 
label %for.body @@ -2252,6 +2866,11 @@ define void @log2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_log2f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @log2_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_log2f( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @log2f(float [[IN:%.*]]) #[[ATTR63:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @log2_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog2q_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2259,6 +2878,11 @@ define void @log2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @log2_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svlog2_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @log2_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svlog2_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @log2f(float [[IN:%.*]]) #[[ATTR63:[0-9]+]] ; entry: br label %for.body @@ -2288,7 +2912,12 @@ define void @modf_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; SLEEF-SVE-LABEL: define void @modf_f64 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: [[TMP23:%.*]] = call @_ZGVsMxvl8_modf( 
[[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SLEEF-SVE: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] +; +; SLEEF-SVE-NOPRED-LABEL: define void @modf_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP17:%.*]] = call @_ZGVsNxvl8_modf( [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]]) +; SLEEF-SVE-NOPRED: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR64:[0-9]+]] ; ; ARMPL-NEON-LABEL: define void @modf_f64 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { @@ -2296,7 +2925,11 @@ define void @modf_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; ARMPL-SVE-LABEL: define void @modf_f64 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: [[TMP23:%.*]] = call @armpl_svmodf_f64_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] +; +; ARMPL-SVE-NOPRED-LABEL: define void @modf_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP5:%.*]] = call <2 x double> @armpl_vmodfq_f64(<2 x double> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]]) ; entry: br label %for.body @@ -2324,7 +2957,12 @@ define void @modf_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; SLEEF-SVE-LABEL: define void @modf_f32 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: [[TMP23:%.*]] = call @_ZGVsMxvl4_modff( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SLEEF-SVE: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] +; +; SLEEF-SVE-NOPRED-LABEL: define void @modf_f32 +; 
SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP17:%.*]] = call @_ZGVsNxvl4_modff( [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]]) +; SLEEF-SVE-NOPRED: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR65:[0-9]+]] ; ; ARMPL-NEON-LABEL: define void @modf_f32 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { @@ -2332,7 +2970,11 @@ define void @modf_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; ARMPL-SVE-LABEL: define void @modf_f32 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: [[TMP23:%.*]] = call @armpl_svmodf_f32_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] +; +; ARMPL-SVE-NOPRED-LABEL: define void @modf_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP5:%.*]] = call <4 x float> @armpl_vmodfq_f32(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]]) ; entry: br label %for.body @@ -2365,6 +3007,11 @@ define void @nextafter_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_nextafter( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @nextafter_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_nextafter( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]]) 
#[[ATTR66:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @nextafter_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vnextafterq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -2372,6 +3019,11 @@ define void @nextafter_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @nextafter_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svnextafter_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @nextafter_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svnextafter_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]]) #[[ATTR64:[0-9]+]] ; entry: br label %for.body @@ -2400,6 +3052,11 @@ define void @nextafter_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_nextafterf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @nextafter_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_nextafterf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]]) #[[ATTR67:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @nextafter_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], 
ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vnextafterq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -2407,6 +3064,11 @@ define void @nextafter_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @nextafter_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svnextafter_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @nextafter_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svnextafter_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]]) #[[ATTR65:[0-9]+]] ; entry: br label %for.body @@ -2438,6 +3100,11 @@ define void @pow_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_pow( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @pow_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_pow( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @pow(double [[IN:%.*]], double [[IN]]) #[[ATTR68:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @pow_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vpowq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> 
[[WIDE_LOAD]]) @@ -2445,6 +3112,11 @@ define void @pow_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @pow_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svpow_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @pow_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svpow_f64_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @pow(double [[IN:%.*]], double [[IN]]) #[[ATTR66:[0-9]+]] ; entry: br label %for.body @@ -2473,6 +3145,11 @@ define void @pow_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxvv_powf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @pow_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxvv_powf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @powf(float [[IN:%.*]], float [[IN]]) #[[ATTR69:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @pow_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vpowq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -2480,6 +3157,11 @@ define void @pow_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @pow_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr 
noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svpow_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @pow_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svpow_f32_x( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @powf(float [[IN:%.*]], float [[IN]]) #[[ATTR67:[0-9]+]] ; entry: br label %for.body @@ -2511,6 +3193,11 @@ define void @sin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sin( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sin_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sin( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @sin(double [[IN:%.*]]) #[[ATTR70:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @sin_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2518,6 +3205,11 @@ define void @sin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @sin_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsin_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sin_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) 
#[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsin_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @sin(double [[IN:%.*]]) #[[ATTR68:[0-9]+]] ; entry: br label %for.body @@ -2546,6 +3238,11 @@ define void @sin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sinf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sin_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sinf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @sinf(float [[IN:%.*]]) #[[ATTR71:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @sin_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2553,6 +3250,11 @@ define void @sin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @sin_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsin_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sin_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsin_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @sinf(float [[IN:%.*]]) #[[ATTR69:[0-9]+]] ; entry: br label %for.body @@ -2582,7 +3284,12 @@ define 
void @sincos_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; SLEEF-SVE-LABEL: define void @sincos_f64 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: call void @_ZGVsMxvl8l8_sincos( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SLEEF-SVE: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR6:[0-9]+]] +; +; SLEEF-SVE-NOPRED-LABEL: define void @sincos_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: call void @_ZGVsNxvl8l8_sincos( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]]) +; SLEEF-SVE-NOPRED: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR72:[0-9]+]] ; ; ARMPL-NEON-LABEL: define void @sincos_f64 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { @@ -2590,7 +3297,11 @@ define void @sincos_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; ARMPL-SVE-LABEL: define void @sincos_f64 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: call void @armpl_svsincos_f64_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR6:[0-9]+]] +; +; ARMPL-SVE-NOPRED-LABEL: define void @sincos_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: call void @armpl_vsincosq_f64(<2 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) ; entry: br label %for.body @@ -2617,7 +3328,12 @@ define void @sincos_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; SLEEF-SVE-LABEL: define void @sincos_f32 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr 
noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: call void @_ZGVsMxvl4l4_sincosf( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SLEEF-SVE: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR7:[0-9]+]] +; +; SLEEF-SVE-NOPRED-LABEL: define void @sincos_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: call void @_ZGVsNxvl4l4_sincosf( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]]) +; SLEEF-SVE-NOPRED: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR73:[0-9]+]] ; ; ARMPL-NEON-LABEL: define void @sincos_f32 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { @@ -2625,7 +3341,11 @@ define void @sincos_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; ARMPL-SVE-LABEL: define void @sincos_f32 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: call void @armpl_svsincos_f32_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR7:[0-9]+]] +; +; ARMPL-SVE-NOPRED-LABEL: define void @sincos_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: call void @armpl_vsincosq_f32(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) ; entry: br label %for.body @@ -2655,7 +3375,12 @@ define void @sincospi_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; SLEEF-SVE-LABEL: define void @sincospi_f64 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: call void @_ZGVsMxvl8l8_sincospi( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; 
SLEEF-SVE: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR8:[0-9]+]] +; +; SLEEF-SVE-NOPRED-LABEL: define void @sincospi_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: call void @_ZGVsNxvl8l8_sincospi( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]]) +; SLEEF-SVE-NOPRED: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR74:[0-9]+]] ; ; ARMPL-NEON-LABEL: define void @sincospi_f64 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { @@ -2663,7 +3388,11 @@ define void @sincospi_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; ARMPL-SVE-LABEL: define void @sincospi_f64 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: call void @armpl_svsincospi_f64_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR8:[0-9]+]] +; +; ARMPL-SVE-NOPRED-LABEL: define void @sincospi_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: call void @armpl_vsincospiq_f64(<2 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) ; entry: br label %for.body @@ -2690,7 +3419,12 @@ define void @sincospi_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; SLEEF-SVE-LABEL: define void @sincospi_f32 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: call void @_ZGVsMxvl4l4_sincospif( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SLEEF-SVE: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR9:[0-9]+]] +; +; SLEEF-SVE-NOPRED-LABEL: define void 
@sincospi_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: call void @_ZGVsNxvl4l4_sincospif( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]]) +; SLEEF-SVE-NOPRED: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR75:[0-9]+]] ; ; ARMPL-NEON-LABEL: define void @sincospi_f32 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { @@ -2698,7 +3432,11 @@ define void @sincospi_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; ; ARMPL-SVE-LABEL: define void @sincospi_f32 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: call void @armpl_svsincospi_f32_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR9:[0-9]+]] +; +; ARMPL-SVE-NOPRED-LABEL: define void @sincospi_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: call void @armpl_vsincospiq_f32(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) ; entry: br label %for.body @@ -2730,6 +3468,11 @@ define void @sinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sinh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sinh_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sinh( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @sinh(double [[IN:%.*]]) #[[ATTR76:[0-9]+]] +; ; 
ARMPL-NEON-LABEL: define void @sinh_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinhq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2737,6 +3480,11 @@ define void @sinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @sinh_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsinh_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sinh_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsinh_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @sinh(double [[IN:%.*]]) #[[ATTR70:[0-9]+]] ; entry: br label %for.body @@ -2765,6 +3513,11 @@ define void @sinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sinhf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sinh_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sinhf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @sinhf(float [[IN:%.*]]) #[[ATTR77:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @sinh_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinhq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2772,6 +3525,11 @@ define void @sinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; 
ARMPL-SVE-LABEL: define void @sinh_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsinh_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sinh_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsinh_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @sinhf(float [[IN:%.*]]) #[[ATTR71:[0-9]+]] ; entry: br label %for.body @@ -2803,6 +3561,11 @@ define void @sinpi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sinpi( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sinpi_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sinpi( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]]) #[[ATTR78:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @sinpi_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinpiq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2810,6 +3573,11 @@ define void @sinpi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @sinpi_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsinpi_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sinpi_f64 +; 
ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsinpi_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]]) #[[ATTR72:[0-9]+]] ; entry: br label %for.body @@ -2838,6 +3606,11 @@ define void @sinpi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sinpif( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sinpi_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sinpif( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]]) #[[ATTR79:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @sinpi_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinpiq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2845,6 +3618,11 @@ define void @sinpi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @sinpi_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsinpi_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sinpi_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsinpi_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call 
float @sinpif(float [[IN:%.*]]) #[[ATTR73:[0-9]+]] ; entry: br label %for.body @@ -2876,6 +3654,11 @@ define void @sqrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sqrt( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sqrt_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sqrt( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @sqrt(double [[IN:%.*]]) #[[ATTR80:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @sqrt_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsqrtq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2883,6 +3666,11 @@ define void @sqrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @sqrt_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsqrt_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sqrt_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsqrt_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @sqrt(double [[IN:%.*]]) #[[ATTR74:[0-9]+]] ; entry: br label %for.body @@ -2911,6 +3699,11 @@ define void @sqrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_sqrtf( 
[[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @sqrt_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_sqrtf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @sqrtf(float [[IN:%.*]]) #[[ATTR81:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @sqrt_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsqrtq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2918,6 +3711,11 @@ define void @sqrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @sqrt_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svsqrt_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @sqrt_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svsqrt_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @sqrtf(float [[IN:%.*]]) #[[ATTR75:[0-9]+]] ; entry: br label %for.body @@ -2949,6 +3747,11 @@ define void @tan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_tan( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @tan_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_tan( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, 
i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @tan(double [[IN:%.*]]) #[[ATTR82:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @tan_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vtanq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2956,6 +3759,11 @@ define void @tan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @tan_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svtan_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @tan_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svtan_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @tan(double [[IN:%.*]]) #[[ATTR76:[0-9]+]] ; entry: br label %for.body @@ -2984,6 +3792,11 @@ define void @tan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_tanf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @tan_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_tanf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @tanf(float [[IN:%.*]]) #[[ATTR83:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @tan_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vtanq_f32(<4 
x float> [[WIDE_LOAD:%.*]]) @@ -2991,6 +3804,11 @@ define void @tan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @tan_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svtan_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @tan_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svtan_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @tanf(float [[IN:%.*]]) #[[ATTR77:[0-9]+]] ; entry: br label %for.body @@ -3022,6 +3840,11 @@ define void @tanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_tanh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @tanh_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_tanh( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @tanh(double [[IN:%.*]]) #[[ATTR84:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @tanh_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vtanhq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -3029,6 +3852,11 @@ define void @tanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @tanh_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svtanh_f64_x( 
[[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @tanh_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svtanh_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @tanh(double [[IN:%.*]]) #[[ATTR78:[0-9]+]] ; entry: br label %for.body @@ -3057,6 +3885,11 @@ define void @tanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_tanhf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @tanh_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_tanhf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @tanhf(float [[IN:%.*]]) #[[ATTR85:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @tanh_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vtanhq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -3064,6 +3897,11 @@ define void @tanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @tanh_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svtanh_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @tanh_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svtanh_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( 
poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @tanhf(float [[IN:%.*]]) #[[ATTR79:[0-9]+]] ; entry: br label %for.body @@ -3095,6 +3933,11 @@ define void @tgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_tgamma( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @tgamma_f64 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_tgamma( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call double @tgamma(double [[IN:%.*]]) #[[ATTR86:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @tgamma_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vtgammaq_f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -3102,6 +3945,11 @@ define void @tgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @tgamma_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svtgamma_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @tgamma_f64 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svtgamma_f64_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call double @tgamma(double [[IN:%.*]]) #[[ATTR80:[0-9]+]] ; entry: br label %for.body @@ -3130,6 +3978,11 @@ define void @tgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; SLEEF-SVE-SAME: (ptr 
noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-SVE: [[TMP15:%.*]] = call @_ZGVsMxv_tgammaf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; +; SLEEF-SVE-NOPRED-LABEL: define void @tgamma_f32 +; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE-NOPRED: [[TMP9:%.*]] = call @_ZGVsMxv_tgammaf( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SLEEF-SVE-NOPRED: [[CALL:%.*]] = tail call float @tgammaf(float [[IN:%.*]]) #[[ATTR87:[0-9]+]] +; ; ARMPL-NEON-LABEL: define void @tgamma_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vtgammaq_f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -3137,6 +3990,11 @@ define void @tgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ARMPL-SVE-LABEL: define void @tgamma_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svtgamma_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-SVE-NOPRED-LABEL: define void @tgamma_f32 +; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE-NOPRED: [[TMP9:%.*]] = call @armpl_svtgamma_f32_x( [[WIDE_LOAD:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; ARMPL-SVE-NOPRED: [[CALL:%.*]] = tail call float @tgammaf(float [[IN:%.*]]) #[[ATTR81:[0-9]+]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll index 29440ca174248f..f60ab5e848dd3a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll @@ -21,7 +21,8 @@ define void @test_linear8(ptr noalias 
%a, ptr readnone %b, i64 %n) { ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP2:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP5:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP4]]) -; NEON_INTERLEAVE: [[TMP6:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP4]]) +; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP3:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP7:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP6]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR0:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear8 @@ -34,8 +35,9 @@ define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) { ; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; SVE_OR_NEON_INTERLEAVE: [[TMP33:%.*]] = extractelement [[TMP31:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[TMP34:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP47:%.*]] = extractelement [[TMP45:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP32:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP36:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP35]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_OR_NEON_INTERLEAVE: [[TMP48:%.*]] = extractelement [[TMP46:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear8 @@ -49,8 +51,9 @@ define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) { ; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; SVE_TF_INTERLEAVE: [[TMP33:%.*]] = extractelement [[TMP31:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[TMP34:%.*]] = call 
@vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP47:%.*]] = extractelement [[TMP45:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP32:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP36:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP35]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_TF_INTERLEAVE: [[TMP48:%.*]] = extractelement [[TMP46:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] ; entry: @@ -81,7 +84,8 @@ define void @test_vector_linear4(ptr noalias %a, ptr readnone %b, ptr readonly % ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <4 x ptr> [[TMP6:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP9:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP8]]) -; NEON_INTERLEAVE: [[TMP10:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD2:%.*]], ptr [[TMP8]]) +; NEON_INTERLEAVE: [[TMP10:%.*]] = extractelement <4 x ptr> [[TMP7:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP11:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD2:%.*]], ptr [[TMP10]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_vector_linear4 @@ -176,7 +180,8 @@ define void @test_linear16_wide_stride(ptr noalias %a, ptr readnone %b, i64 %n) ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP4:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP7:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP6]]) -; NEON_INTERLEAVE: [[TMP8:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP6]]) +; 
NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP5:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP9:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP8]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2]] ; ; SVE_OR_NEON-LABEL: define void @test_linear16_wide_stride @@ -228,7 +233,9 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly ; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP2:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP7:%.*]] = extractelement <4 x ptr> [[TMP4:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP8:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP6]], ptr [[TMP7]]) -; NEON_INTERLEAVE: [[TMP9:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP6]], ptr [[TMP7]]) +; NEON_INTERLEAVE: [[TMP9:%.*]] = extractelement <4 x ptr> [[TMP3:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP10:%.*]] = extractelement <4 x ptr> [[TMP5:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP11:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP9]], ptr [[TMP10]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear4_linear8 @@ -243,8 +250,10 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly ; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP31:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[TMP36:%.*]] = extractelement [[TMP33:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[TMP37:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP38:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP50:%.*]] = extractelement [[TMP48:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP38:%.*]] = extractelement [[TMP32:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP39:%.*]] = extractelement 
[[TMP34:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP40:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP38]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_OR_NEON_INTERLEAVE: [[TMP52:%.*]] = extractelement [[TMP50:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear4_linear8 @@ -260,8 +269,10 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly ; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP31:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[TMP36:%.*]] = extractelement [[TMP33:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[TMP37:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP38:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP50:%.*]] = extractelement [[TMP48:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP38:%.*]] = extractelement [[TMP32:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP39:%.*]] = extractelement [[TMP34:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP40:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP38]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_TF_INTERLEAVE: [[TMP52:%.*]] = extractelement [[TMP50:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]] ; entry: @@ -293,7 +304,8 @@ define void @test_linear3_non_ptr(ptr noalias %a, i64 %n) { ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP5:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP4]]) -; NEON_INTERLEAVE: [[TMP6:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP4]]) +; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP7:%.*]] = call <4 x 
i32> @vec_bar_linear3_nomask_neon(i32 [[TMP6]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR4:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear3_non_ptr @@ -343,7 +355,8 @@ define void @test_linearn5_non_ptr_neg_stride(ptr noalias %a, i64 %n) { ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP5:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP4]]) -; NEON_INTERLEAVE: [[TMP6:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP4]]) +; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP7:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP6]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR5:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linearn5_non_ptr_neg_stride @@ -393,7 +406,8 @@ define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n) ; NEON_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP6:%.*]], i32 0 ; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP8]]) -; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD2:%.*]], ptr [[TMP8]]) +; NEON_INTERLEAVE: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP7:%.*]], i32 0 +; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD2:%.*]], ptr [[TMP9]]) ; NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR6:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear8_return_void @@ -406,8 +420,9 @@ define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n) ; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; SVE_OR_NEON_INTERLEAVE: 
[[TMP39:%.*]] = extractelement [[TMP37:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP45:%.*]] = extractelement [[TMP43:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP40:%.*]] = extractelement [[TMP38:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP40]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_OR_NEON_INTERLEAVE: [[TMP46:%.*]] = extractelement [[TMP44:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear8_return_void @@ -421,8 +436,9 @@ define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n) ; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; SVE_TF_INTERLEAVE: [[TMP39:%.*]] = extractelement [[TMP37:%.*]], i32 0 ; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP45:%.*]] = extractelement [[TMP43:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP40:%.*]] = extractelement [[TMP38:%.*]], i32 0 +; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP40]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_TF_INTERLEAVE: [[TMP46:%.*]] = extractelement [[TMP44:%.*]], i32 0 ; SVE_TF_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]] ; entry: diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll b/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll new file mode 100644 index 
00000000000000..034af89112cb63 --- /dev/null +++ b/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll @@ -0,0 +1,68 @@ +; REQUIRES: aarch64-registered-target + +; RUN: opt -passes=lowertypetests %s -o %t.o +; RUN: llvm-dis %t.o -o - | FileCheck %s --check-prefix=CHECK-foobar +; CHECK-foobar: {{llvm.global.annotations = .*[foo|bar], .*[foo|bar],}} +; RUN: llvm-dis %t.o -o - | FileCheck %s --check-prefix=CHECK-cfi +; CHECK-cfi-NOT: {{llvm.global.annotations = .*cfi.*}} + +target triple = "aarch64-none-linux-gnu" + +@.src = private unnamed_addr constant [7 x i8] c"test.c\00", align 1 +@.str = private unnamed_addr constant [30 x i8] c"annotation_string_literal_bar\00", section "llvm.metadata" +@.str.1 = private unnamed_addr constant [7 x i8] c"test.c\00", section "llvm.metadata" +@.str.2 = private unnamed_addr constant [30 x i8] c"annotation_string_literal_foo\00", section "llvm.metadata" +@llvm.global.annotations = appending global [2 x { ptr, ptr, ptr, i32, ptr }] [{ ptr, ptr, ptr, i32, ptr } { ptr @bar, ptr @.str, ptr @.str.1, i32 2, ptr null }, { ptr, ptr, ptr, i32, ptr } { ptr @foo, ptr @.str.2, ptr @.str.1, i32 1, ptr null }], section "llvm.metadata" + +define i32 @bar(i32 noundef %0) #0 !type !8 !type !9 { + %2 = alloca i32, align 4 + store i32 %0, ptr %2, align 4 + %3 = load i32, ptr %2, align 4 + %4 = call i32 @foo(i32 noundef %3) + ret i32 %4 +} + +declare !type !8 !type !9 i32 @foo(i32 noundef) #1 + +define i32 @test(i32 noundef %0) #0 !type !8 !type !9 { + %2 = alloca i32, align 4 + %3 = alloca ptr, align 8 + store i32 %0, ptr %2, align 4 + %4 = load i32, ptr %2, align 4 + %5 = icmp sgt i32 %4, 0 + %6 = zext i1 %5 to i64 + %7 = select i1 %5, ptr @foo, ptr @bar + store ptr %7, ptr %3, align 8 + %8 = load ptr, ptr %3, align 8 + %9 = call i1 @llvm.type.test(ptr %8, metadata !"_ZTSFiiE"), !nosanitize !10 + br i1 %9, label %11, label %10, !nosanitize !10 + +10: + call void @llvm.ubsantrap(i8 2) #4, !nosanitize !10 + unreachable, !nosanitize !10 + +11: + %12 = 
load i32, ptr %2, align 4 + %13 = call i32 %8(i32 noundef %12) + ret i32 %13 +} + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.ubsantrap(i8 immarg) + +attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" } +attributes #1 = { "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" } +attributes #4 = { noreturn nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 4, !"CFI Canonical Jump Tables", i32 0} +!2 = !{i32 8, !"PIC Level", i32 2} +!3 = !{i32 7, !"uwtable", i32 2} +!4 = !{i32 7, !"frame-pointer", i32 1} +!5 = !{i32 1, !"ThinLTO", i32 0} +!6 = !{i32 1, !"EnableSplitLTOUnit", i32 1} +!8 = !{i64 0, !"_ZTSFiiE"} +!9 = !{i64 0, !"_ZTSFiiE.generalized"} +!10 = !{} diff --git a/llvm/test/Transforms/PhaseOrdering/X86/loop-vectorizer-noalias.ll b/llvm/test/Transforms/PhaseOrdering/X86/loop-vectorizer-noalias.ll new file mode 100644 index 00000000000000..5c85c0d21f59f7 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/X86/loop-vectorizer-noalias.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -O3 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define internal void @acc(ptr noalias noundef %val, ptr noalias noundef %prev) { +entry: + %0 = load i8, ptr %prev, align 1 + %conv = zext i8 %0 to i32 + %1 = load i8, ptr %val, align 1 + %conv1 = zext i8 %1 to i32 + %add = add nsw i32 %conv1, %conv + %conv2 = trunc i32 %add to i8 + store i8 %conv2, ptr %val, align 1 + ret void +} + +; This loop should not get vectorized. 
+define void @accsum(ptr noundef %vals, i64 noundef %num) #0 { +; CHECK-LABEL: define void @accsum( +; CHECK-SAME: ptr nocapture noundef [[VALS:%.*]], i64 noundef [[NUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[NUM]], 1 +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[LOAD_INITIAL:%.*]] = load i8, ptr [[VALS]], align 1 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[STORE_FORWARDED:%.*]] = phi i8 [ [[LOAD_INITIAL]], [[FOR_BODY_PREHEADER]] ], [ [[ADD_I:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_02:%.*]] = phi i64 [ 1, [[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[VALS]], i64 [[I_02]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: [[ADD_I]] = add i8 [[TMP0]], [[STORE_FORWARDED]] +; CHECK-NEXT: store i8 [[ADD_I]], ptr [[ARRAYIDX]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: [[INC]] = add nuw i64 [[I_02]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUM]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i64 [ 1, %entry ], [ %inc, %for.inc ] + %cmp = icmp ult i64 %i.0, %num + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + br label %for.end + +for.body: ; preds = %for.cond + %arrayidx = getelementptr inbounds i8, ptr %vals, i64 %i.0 + %sub = sub i64 %i.0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %vals, i64 %sub + call void @acc(ptr noundef %arrayidx, ptr noundef %arrayidx1) + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add i64 %i.0, 1 + br label %for.cond + 
+for.end: ; preds = %for.cond.cleanup + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +attributes #0 = { "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87"} +;. +; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]], !"acc: %val"} +; CHECK: [[META2]] = distinct !{[[META2]], !"acc"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META2]], !"acc: %prev"} +;. diff --git a/llvm/test/Transforms/SLPVectorizer/X86/call-arg-reduced-by-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/call-arg-reduced-by-minbitwidth.ll new file mode 100644 index 00000000000000..49e89feb475b95 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/call-arg-reduced-by-minbitwidth.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-pc-windows-msvc19.34.0 < %s | FileCheck %s + +define void @test(ptr %0, i8 %1, i1 %cmp12.i) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[TMP0:%.*]], i8 [[TMP1:%.*]], i1 [[CMP12_I:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i1> poison, i1 [[CMP12_I]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: br label [[PRE:%.*]] +; CHECK: pre: +; CHECK-NEXT: [[TMP6:%.*]] = zext <8 x i8> [[TMP5]] to <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> 
@llvm.umax.v8i32(<8 x i32> [[TMP6]], <8 x i32> ) +; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[TMP8]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc <8 x i32> [[TMP9]] to <8 x i8> +; CHECK-NEXT: store <8 x i8> [[TMP10]], ptr [[TMP0]], align 1 +; CHECK-NEXT: br label [[PRE]] +; +entry: + %idx11 = getelementptr i8, ptr %0, i64 1 + %idx22 = getelementptr i8, ptr %0, i64 2 + %idx33 = getelementptr i8, ptr %0, i64 3 + %idx44 = getelementptr i8, ptr %0, i64 4 + %idx55 = getelementptr i8, ptr %0, i64 5 + %idx66 = getelementptr i8, ptr %0, i64 6 + %idx77 = getelementptr i8, ptr %0, i64 7 + br label %pre + +pre: + %conv.i = zext i8 %1 to i32 + %2 = tail call i32 @llvm.umax.i32(i32 %conv.i, i32 1) + %.sroa.speculated.i = add i32 %2, 1 + %intensity.0.i = select i1 %cmp12.i, i32 %.sroa.speculated.i, i32 %conv.i + %conv14.i = trunc i32 %intensity.0.i to i8 + store i8 %conv14.i, ptr %0, align 1 + %conv.i.1 = zext i8 %1 to i32 + %3 = tail call i32 @llvm.umax.i32(i32 %conv.i.1, i32 1) + %ss1 = add i32 %3, 1 + %ii1 = select i1 %cmp12.i, i32 %ss1, i32 %conv.i.1 + %conv14.i.1 = trunc i32 %ii1 to i8 + store i8 %conv14.i.1, ptr %idx11, align 1 + %conv.i.2 = zext i8 %1 to i32 + %4 = tail call i32 @llvm.umax.i32(i32 %conv.i.2, i32 1) + %ss2 = add i32 %4, 1 + %ii2 = select i1 %cmp12.i, i32 %ss2, i32 %conv.i.2 + %conv14.i.2 = trunc i32 %ii2 to i8 + store i8 %conv14.i.2, ptr %idx22, align 1 + %conv.i.3 = zext i8 %1 to i32 + %5 = tail call i32 @llvm.umax.i32(i32 %conv.i.3, i32 1) + %ss3 = add i32 %5, 1 + %ii3 = select i1 %cmp12.i, i32 %ss3, i32 %conv.i.3 + %conv14.i.3 = trunc i32 %ii3 to i8 + store i8 %conv14.i.3, ptr %idx33, align 1 + %conv.i.4 = zext i8 %1 to i32 + %6 = tail call i32 @llvm.umax.i32(i32 %conv.i.4, i32 1) + %ss4 = add i32 %6, 1 + %ii4 = select i1 %cmp12.i, i32 %ss4, i32 %conv.i.4 + %conv14.i.4 = trunc i32 %ii4 to i8 + store i8 %conv14.i.4, ptr %idx44, align 1 + %conv.i.5 = zext i8 %1 to i32 
+ %7 = tail call i32 @llvm.umax.i32(i32 %conv.i.5, i32 1) + %ss5 = add i32 %7, 1 + %ii5 = select i1 %cmp12.i, i32 %ss5, i32 %conv.i.5 + %conv14.i.5 = trunc i32 %ii5 to i8 + store i8 %conv14.i.5, ptr %idx55, align 1 + %conv.i.6 = zext i8 %1 to i32 + %8 = tail call i32 @llvm.umax.i32(i32 %conv.i.6, i32 1) + %ss6 = add i32 %8, 1 + %ii6 = select i1 %cmp12.i, i32 %ss6, i32 %conv.i.6 + %conv14.i.6 = trunc i32 %ii6 to i8 + store i8 %conv14.i.6, ptr %idx66, align 1 + %conv.i.7 = zext i8 %1 to i32 + %9 = tail call i32 @llvm.umax.i32(i32 %conv.i.7, i32 1) + %ss7 = add i32 %9, 1 + %ii7 = select i1 %cmp12.i, i32 %ss7, i32 %conv.i.7 + %conv14.i.7 = trunc i32 %ii7 to i8 + store i8 %conv14.i.7, ptr %idx77, align 1 + br label %pre +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll new file mode 100644 index 00000000000000..ba406c8f20bb08 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v3 < %s | FileCheck %s + +define void @test(double %i) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: double [[I:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> , double [[I]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> zeroinitializer, [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP3]], <4 x i32> +; CHECK-NEXT: 
[[TMP7:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> [[TMP7]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> , <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP4]], i32 7 +; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> zeroinitializer, [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x double> zeroinitializer, [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x double> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = fcmp ult <8 x double> [[TMP13]], zeroinitializer +; CHECK-NEXT: br label [[BB116:%.*]] +; CHECK: bb116: +; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x double> [[TMP15]], i32 1 +; CHECK-NEXT: [[I120:%.*]] = fadd double [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x double> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP19:%.*]] = fmul <2 x double> zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[TMP18]], i32 1 +; CHECK-NEXT: [[I128:%.*]] = fadd double [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[I139:%.*]] = call double @llvm.maxnum.f64(double [[I128]], double 0.000000e+00) +; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x double> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP22]], <2 x double> zeroinitializer) +; CHECK-NEXT: [[TMP24:%.*]] = fmul <2 x double> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = fptosi <2 x double> [[TMP24]] to <2 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub <2 x i32> zeroinitializer, [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer +; CHECK-NEXT: 
[[I147:%.*]] = fcmp ogt double [[I120]], 0.000000e+00 +; CHECK-NEXT: ret void +; +bb: + %i74 = fsub double 0.000000e+00, poison + %i75 = fsub double 0.000000e+00, %i + %i76 = fmul double 0.000000e+00, %i75 + %i77 = fadd double %i76, 0.000000e+00 + %i78 = fadd double %i77, 0.000000e+00 + %i79 = fcmp ult double %i78, 0.000000e+00 + %i81 = fsub double %i, 0.000000e+00 + %i82 = fmul double 0.000000e+00, %i81 + %i83 = fadd double 0.000000e+00, %i82 + %i84 = fadd double %i83, 0.000000e+00 + %i85 = fcmp ult double %i84, 0.000000e+00 + %i86 = fsub double 0.000000e+00, %i + %i87 = fmul double 0.000000e+00, %i86 + %i88 = fadd double %i87, 0.000000e+00 + %i89 = fadd double %i88, 0.000000e+00 + %i90 = fcmp ult double %i89, 0.000000e+00 + %i91 = fsub double 0.000000e+00, 0.000000e+00 + %i92 = fmul double 0.000000e+00, 0.000000e+00 + %i93 = fadd double %i92, 0.000000e+00 + %i94 = fadd double %i93, 0.000000e+00 + %i95 = fcmp ult double %i94, 0.000000e+00 + %i96 = fsub double poison, 0.000000e+00 + %i97 = fadd double %i77, 0.000000e+00 + %i98 = fcmp ult double %i97, 0.000000e+00 + %i99 = fadd double %i83, 0.000000e+00 + %i100 = fcmp ult double %i99, 0.000000e+00 + %i101 = fmul double 0.000000e+00, 0.000000e+00 + %i102 = fadd double %i101, 0.000000e+00 + %i103 = fadd double %i102, 0.000000e+00 + %i104 = fcmp ult double %i103, 0.000000e+00 + %i105 = fmul double 0.000000e+00, 0.000000e+00 + %i106 = fadd double %i105, 0.000000e+00 + %i107 = fadd double %i106, 0.000000e+00 + %i108 = fcmp ult double %i107, 0.000000e+00 + br label %bb116 + +bb116: + %i117 = fmul double 0.000000e+00, %i81 + %i119 = fmul double 0.000000e+00, %i96 + %i120 = fadd double %i117, %i119 + %i121 = fmul double 0.000000e+00, %i74 + %i122 = fmul double 0.000000e+00, %i75 + %i123 = fadd double %i122, 0.000000e+00 + %i124 = fmul double 0.000000e+00, %i91 + %i125 = fadd double %i124, 0.000000e+00 + %i127 = fmul double 0.000000e+00, %i86 + %i128 = fadd double %i127, %i121 + %i133 = call double @llvm.maxnum.f64(double 
%i123, double 0.000000e+00) + %i134 = fmul double %i133, 0.000000e+00 + %i135 = fptosi double %i134 to i32 + %i136 = sub i32 0, %i135 + %i137 = icmp sgt i32 %i136, 0 + %i139 = call double @llvm.maxnum.f64(double %i128, double 0.000000e+00) + %i142 = call double @llvm.maxnum.f64(double %i125, double 0.000000e+00) + %i143 = fmul double %i142, 0.000000e+00 + %i144 = fptosi double %i143 to i32 + %i145 = sub i32 0, %i144 + %i146 = icmp sgt i32 %i145, 0 + %i147 = fcmp ogt double %i120, 0.000000e+00 + ret void +} + +declare double @llvm.maxnum.f64(double, double) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll new file mode 100644 index 00000000000000..f665dac3282b79 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v3 -S < %s | FileCheck %s + +define void @foo(double %i) { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: double [[I:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> , double [[I]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> , <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> [[TMP7]], double [[TMP2]], i32 3 +; 
CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> , <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP5]], i32 6 +; CHECK-NEXT: [[TMP12:%.*]] = fmul <8 x double> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x double> zeroinitializer, [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x double> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = fcmp ult <8 x double> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = freeze <8 x i1> [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP16]]) +; CHECK-NEXT: br i1 [[TMP17]], label [[BB58:%.*]], label [[BB115:%.*]] +; CHECK: bb115: +; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x double> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP18]], i32 1 +; CHECK-NEXT: [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP22]], <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = fmul <4 x double> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = fmul <4 x double> [[TMP27]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = fptosi <4 x double> [[TMP28]] to <4 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = or <4 x i32> zeroinitializer, [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = call i32 
@llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP30]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = icmp slt i32 [[TMP31]], 32000 +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP31]], i32 32000 +; CHECK-NEXT: [[I163:%.*]] = fcmp ogt double [[I118]], 0.000000e+00 +; CHECK-NEXT: [[I164:%.*]] = icmp slt i32 0, [[OP_RDX1]] +; CHECK-NEXT: unreachable +; CHECK: bb58: +; CHECK-NEXT: ret void +; +bb: + %i75 = fsub double 0.000000e+00, 0.000000e+00 + %i76 = fsub double 0.000000e+00, 0.000000e+00 + %i77 = fmul double 0.000000e+00, %i75 + %i78 = fmul double 0.000000e+00, %i76 + %i79 = fadd double %i78, 0.000000e+00 + %i80 = fadd double %i79, 0.000000e+00 + %i81 = fcmp ult double %i80, 0.000000e+00 + %i82 = fsub double 0.000000e+00, poison + %i83 = fmul double 0.000000e+00, %i82 + %i84 = fadd double 0.000000e+00, %i83 + %i85 = fadd double %i84, 0.000000e+00 + %i86 = fcmp ult double %i85, 0.000000e+00 + %i87 = fsub double 0.000000e+00, %i + %i88 = fadd double 0.000000e+00, %i77 + %i89 = fadd double %i88, 0.000000e+00 + %i90 = fcmp ult double %i89, 0.000000e+00 + %i91 = fsub double 0.000000e+00, 0.000000e+00 + %i92 = fmul double poison, 0.000000e+00 + %i93 = fadd double %i92, 0.000000e+00 + %i94 = fadd double %i93, 0.000000e+00 + %i95 = fcmp ult double %i94, 0.000000e+00 + %i96 = fadd double %i79, 0.000000e+00 + %i97 = fcmp ult double %i96, 0.000000e+00 + %i98 = fadd double %i84, 0.000000e+00 + %i99 = fcmp ult double %i98, 0.000000e+00 + %i100 = fadd double 0.000000e+00, %i77 + %i101 = fadd double %i100, 0.000000e+00 + %i102 = fcmp ult double %i101, 0.000000e+00 + %i103 = fsub double 0.000000e+00, %i + %i104 = fmul double poison, 0.000000e+00 + %i105 = fadd double %i104, 0.000000e+00 + %i106 = fadd double %i105, 0.000000e+00 + %i107 = fcmp ult double %i106, 0.000000e+00 + %i108 = select i1 %i107, i1 %i102, i1 false + %i109 = select i1 %i108, i1 %i99, i1 false + %i110 = select i1 %i109, i1 %i97, i1 false + %i111 = select i1 %i110, i1 %i95, i1 false + %i112 = select i1 %i111, i1 
%i90, i1 false + %i113 = select i1 %i112, i1 %i86, i1 false + %i114 = select i1 %i113, i1 %i81, i1 false + br i1 %i114, label %bb58, label %bb115 + +bb115: + %i116 = fmul double 0.000000e+00, %i103 + %i117 = fmul double 0.000000e+00, %i82 + %i118 = fadd double %i116, %i117 + %i120 = fmul double 0.000000e+00, %i75 + %i121 = fmul double 0.000000e+00, %i76 + %i122 = fadd double %i121, 0.000000e+00 + %i123 = fadd double 0.000000e+00, %i120 + %i124 = fmul double 0.000000e+00, %i91 + %i125 = fadd double %i124, %i82 + %i126 = fadd double %i125, 0.000000e+00 + %i127 = fmul double 0.000000e+00, %i87 + %i128 = fadd double %i127, 0.000000e+00 + %i129 = fadd double %i128, 0.000000e+00 + %i130 = fadd double %i122, 0.000000e+00 + %i131 = fadd double %i123, 0.000000e+00 + %i132 = select i1 false, double 0.000000e+00, double %i131 + %i133 = fmul double %i132, 0.000000e+00 + %i134 = fmul double %i133, 0.000000e+00 + %i135 = fptosi double %i134 to i32 + %i136 = or i32 0, %i135 + %i137 = icmp slt i32 %i136, 32000 + %i138 = select i1 %i137, i32 %i136, i32 32000 + %i139 = select i1 false, double 0.000000e+00, double %i130 + %i140 = fmul double %i139, 0.000000e+00 + %i141 = fmul double %i140, 0.000000e+00 + %i142 = fptosi double %i141 to i32 + %i143 = or i32 0, %i142 + %i144 = icmp slt i32 %i143, %i138 + %i145 = select i1 %i144, i32 %i143, i32 %i138 + %i146 = select i1 false, double 0.000000e+00, double %i129 + %i147 = fmul double %i146, 0.000000e+00 + %i148 = fmul double %i147, 0.000000e+00 + %i149 = fptosi double %i148 to i32 + %i150 = or i32 0, %i149 + %i151 = icmp slt i32 %i150, %i145 + %i152 = select i1 %i151, i32 %i150, i32 %i145 + %i153 = select i1 false, double 0.000000e+00, double %i126 + %i154 = fmul double %i153, 0.000000e+00 + %i155 = fmul double %i154, 0.000000e+00 + %i156 = fptosi double %i155 to i32 + %i157 = or i32 0, %i156 + %i158 = icmp slt i32 %i157, %i152 + %i159 = select i1 %i158, i32 %i157, i32 %i152 + %i163 = fcmp ogt double %i118, 0.000000e+00 + %i164 = icmp slt 
i32 0, %i159 + unreachable + +bb58: + ret void +} diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll index ee35a0fd5fea65..f0e605fb4ad5c9 100644 --- a/llvm/test/Transforms/SROA/vector-promotion.ll +++ b/llvm/test/Transforms/SROA/vector-promotion.ll @@ -1388,6 +1388,68 @@ define <4 x ptr> @ptrLoadStoreTysPtr(ptr %init, i64 %val2) { ret <4 x ptr> %sroaval } +define <4 x i32> @validLoadStoreTy([2 x i64] %cond.coerce) { +; CHECK-LABEL: @validLoadStoreTy( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE:%.*]], 0 +; CHECK-NEXT: [[COND_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[COND_COERCE_FCA_0_EXTRACT]], i32 0 +; CHECK-NEXT: [[COND_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE]], 1 +; CHECK-NEXT: [[COND_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[COND_SROA_0_0_VEC_INSERT]], i64 [[COND_COERCE_FCA_1_EXTRACT]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[COND_SROA_0_8_VEC_INSERT]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP0]] +; +; DEBUG-LABEL: @validLoadStoreTy( +; DEBUG-NEXT: entry: +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META553:![0-9]+]], metadata !DIExpression()), !dbg [[DBG557:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META554:![0-9]+]], metadata !DIExpression()), !dbg [[DBG558:![0-9]+]] +; DEBUG-NEXT: [[COND_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE:%.*]], 0, !dbg [[DBG559:![0-9]+]] +; DEBUG-NEXT: [[COND_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[COND_COERCE_FCA_0_EXTRACT]], i32 0, !dbg [[DBG559]] +; DEBUG-NEXT: [[COND_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE]], 1, !dbg [[DBG559]] +; DEBUG-NEXT: [[COND_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[COND_SROA_0_0_VEC_INSERT]], i64 [[COND_COERCE_FCA_1_EXTRACT]], i32 1, !dbg [[DBG559]] +; 
DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META555:![0-9]+]], metadata !DIExpression()), !dbg [[DBG560:![0-9]+]] +; DEBUG-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[COND_SROA_0_8_VEC_INSERT]] to <4 x i32>, !dbg [[DBG561:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x i32> [[TMP0]], metadata [[META556:![0-9]+]], metadata !DIExpression()), !dbg [[DBG561]] +; DEBUG-NEXT: ret <4 x i32> [[TMP0]], !dbg [[DBG562:![0-9]+]] +; +entry: + %cond = alloca <4 x i32>, align 8 + %coerce.dive2 = getelementptr inbounds <4 x i32>, ptr %cond, i32 0, i32 0 + store [2 x i64] %cond.coerce, ptr %coerce.dive2, align 8 + %m5 = getelementptr inbounds <4 x i32>, ptr %cond, i32 0, i32 0 + %0 = load <4 x i32>, ptr %m5, align 8 + ret <4 x i32> %0 +} + +; The following test should not crash the compiler +; (calls to CheckCandidateType from createAndCheckVectorTypesForPromotion may change the memory to hold CandidateTys.data()) +define noundef zeroext i1 @CandidateTysRealloc() personality ptr null { +entry: + %alloca = alloca <4x i32>, align 16 + store <4 x i32> , ptr %alloca, align 16 + br label %bb.1 + +bb.1: + br label %bb.1 + +bb.2: + %Load0 = load <4 x i32>, ptr %alloca, align 16 + store <4 x i32> zeroinitializer, ptr %alloca, align 16 + %Load1 = load <4 x i32>, ptr %alloca, align 16 + br label %bb.3 + +bb.3: + br label %bb.3 + +bb.4: + %Load2 = load i64, ptr %alloca, align 16 + %Load3 = load <4 x i32>, ptr %alloca, align 16 + store <4 x i32> zeroinitializer, ptr %alloca, align 16 + br label %bb.5 + +bb.5: + br label %bb.5 +} + declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1) declare void @llvm.lifetime.end.p0(i64, ptr) ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/memssa-readnone-access.ll b/llvm/test/Transforms/SimpleLoopUnswitch/memssa-readnone-access.ll index 2aaf777683e116..c6e6608d4be383 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/memssa-readnone-access.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/memssa-readnone-access.ll @@ -115,3 +115,107 @@ split: exit: ret void } + +; Variants of the above test with swapped branch destinations. + +define void @test1_swapped(i1 %c) { +; CHECK-LABEL: define void @test1_swapped( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: start: +; CHECK-NEXT: [[C_FR:%.*]] = freeze i1 [[C]] +; CHECK-NEXT: br i1 [[C_FR]], label [[START_SPLIT_US:%.*]], label [[START_SPLIT:%.*]] +; CHECK: start.split.us: +; CHECK-NEXT: br label [[LOOP_US:%.*]] +; CHECK: loop.us: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label [[LOOP_US]] +; CHECK: start.split: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +start: + br label %loop + +loop: + %fn = load ptr, ptr @vtable, align 8 + call void %fn() + br i1 %c, label %loop, label %exit + +exit: + ret void +} + +define void @test2_swapped(i1 %c, ptr %p) { +; CHECK-LABEL: define void @test2_swapped( +; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[C_FR:%.*]] = freeze i1 [[C]] +; CHECK-NEXT: br i1 [[C_FR]], label [[DOTSPLIT_US:%.*]], label [[DOTSPLIT:%.*]] +; CHECK: .split.us: +; CHECK-NEXT: br label [[LOOP_US:%.*]] +; CHECK: loop.us: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: br label [[LOOP_US]] +; CHECK: .split: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; + br label %loop + +loop: + %fn = load ptr, ptr @vtable, align 8 + call void %fn() + call void 
@bar() + br i1 %c, label %loop, label %exit + +exit: + ret void +} + +define void @test3_swapped(i1 %c, ptr %p) { +; CHECK-LABEL: define void @test3_swapped( +; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[C_FR:%.*]] = freeze i1 [[C]] +; CHECK-NEXT: br i1 [[C_FR]], label [[DOTSPLIT_US:%.*]], label [[DOTSPLIT:%.*]] +; CHECK: .split.us: +; CHECK-NEXT: br label [[LOOP_US:%.*]] +; CHECK: loop.us: +; CHECK-NEXT: br label [[SPLIT_US:%.*]] +; CHECK: split.us: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: br label [[LOOP_US]] +; CHECK: .split: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: br label [[SPLIT:%.*]] +; CHECK: split: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; + br label %loop + +loop: + %fn = load ptr, ptr @vtable, align 8 + br label %split + +split: + call void %fn() + call void @bar() + br i1 %c, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll index 8ccd27030afae8..7b12de90319012 100644 --- a/llvm/test/Transforms/Util/add-TLI-mappings.ll +++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll @@ -23,36 +23,30 @@ ; LIBMVEC-X86-SAME: ptr @_ZGVdN4v_sin ; SLEEFGNUABI-SAME: [16 x ptr] [ ; SLEEFGNUABI-SAME: ptr @_ZGVnN2vl8_modf, -; SLEEFGNUABI-SAME: ptr @_ZGVsMxvl8_modf, +; SLEEFGNUABI-SAME: ptr @_ZGVsNxvl8_modf, ; SLEEFGNUABI-SAME: ptr @_ZGVnN4vl4_modff, -; SLEEFGNUABI-SAME: ptr @_ZGVsMxvl4_modff, +; SLEEFGNUABI-SAME: ptr @_ZGVsNxvl4_modff, ; SLEEFGNUABI-SAME: ptr @_ZGVnN2v_sin, ; SLEEFGNUABI-SAME: ptr @_ZGVsMxv_sin, ; SLEEFGNUABI-SAME: ptr @_ZGVnN2vl8l8_sincos, -; SLEEFGNUABI-SAME: ptr @_ZGVsMxvl8l8_sincos, +; SLEEFGNUABI-SAME: ptr @_ZGVsNxvl8l8_sincos, ; SLEEFGNUABI-SAME: ptr @_ZGVnN4vl4l4_sincosf, -; SLEEFGNUABI-SAME: ptr @_ZGVsMxvl4l4_sincosf, +; SLEEFGNUABI-SAME: ptr @_ZGVsNxvl4l4_sincosf, ; 
SLEEFGNUABI-SAME: ptr @_ZGVnN2vl8l8_sincospi, -; SLEEFGNUABI-SAME: ptr @_ZGVsMxvl8l8_sincospi, +; SLEEFGNUABI-SAME: ptr @_ZGVsNxvl8l8_sincospi, ; SLEEFGNUABI-SAME: ptr @_ZGVnN4vl4l4_sincospif, -; SLEEFGNUABI-SAME: ptr @_ZGVsMxvl4l4_sincospif, +; SLEEFGNUABI-SAME: ptr @_ZGVsNxvl4l4_sincospif, ; SLEEFGNUABI_SAME; ptr @_ZGVnN4v_log10f, ; SLEEFGNUABI-SAME: ptr @_ZGVsMxv_log10f -; ARMPL-SAME: [16 x ptr] [ +; ARMPL-SAME: [10 x ptr] [ ; ARMPL-SAME: ptr @armpl_vmodfq_f64, -; ARMPL-SAME: ptr @armpl_svmodf_f64_x, ; ARMPL-SAME: ptr @armpl_vmodfq_f32, -; ARMPL-SAME: ptr @armpl_svmodf_f32_x, ; ARMPL-SAME: ptr @armpl_vsinq_f64, ; ARMPL-SAME: ptr @armpl_svsin_f64_x, ; ARMPL-SAME: ptr @armpl_vsincosq_f64, -; ARMPL-SAME: ptr @armpl_svsincos_f64_x, ; ARMPL-SAME: ptr @armpl_vsincosq_f32, -; ARMPL-SAME: ptr @armpl_svsincos_f32_x, ; ARMPL-SAME: ptr @armpl_vsincospiq_f64, -; ARMPL-SAME: ptr @armpl_svsincospi_f64_x, ; ARMPL-SAME: ptr @armpl_vsincospiq_f32, -; ARMPL-SAME: ptr @armpl_svsincospi_f32_x, ; ARMPL-SAME: ptr @armpl_vlog10q_f32, ; ARMPL-SAME: ptr @armpl_svlog10_f32_x ; COMMON-SAME: ], section "llvm.metadata" @@ -166,36 +160,30 @@ declare float @llvm.log10.f32(float) #0 ; ACCELERATE: declare <4 x float> @vlog10f(<4 x float>) ; SLEEFGNUABI: declare <2 x double> @_ZGVnN2vl8_modf(<2 x double>, ptr) -; SLEEFGNUABI: declare @_ZGVsMxvl8_modf(, ptr, ) +; SLEEFGNUABI: declare @_ZGVsNxvl8_modf(, ptr) ; SLEEFGNUABI: declare <4 x float> @_ZGVnN4vl4_modff(<4 x float>, ptr) -; SLEEFGNUABI: declare @_ZGVsMxvl4_modff(, ptr, ) +; SLEEFGNUABI: declare @_ZGVsNxvl4_modff(, ptr) ; SLEEFGNUABI: declare <2 x double> @_ZGVnN2v_sin(<2 x double>) ; SLEEFGNUABI: declare @_ZGVsMxv_sin(, ) ; SLEEFGNUABI: declare void @_ZGVnN2vl8l8_sincos(<2 x double>, ptr, ptr) -; SLEEFGNUABI: declare void @_ZGVsMxvl8l8_sincos(, ptr, ptr, ) +; SLEEFGNUABI: declare void @_ZGVsNxvl8l8_sincos(, ptr, ptr) ; SLEEFGNUABI: declare void @_ZGVnN4vl4l4_sincosf(<4 x float>, ptr, ptr) -; SLEEFGNUABI: declare void @_ZGVsMxvl4l4_sincosf(, 
ptr, ptr, ) +; SLEEFGNUABI: declare void @_ZGVsNxvl4l4_sincosf(, ptr, ptr) ; SLEEFGNUABI: declare void @_ZGVnN2vl8l8_sincospi(<2 x double>, ptr, ptr) -; SLEEFGNUABI: declare void @_ZGVsMxvl8l8_sincospi(, ptr, ptr, ) +; SLEEFGNUABI: declare void @_ZGVsNxvl8l8_sincospi(, ptr, ptr) ; SLEEFGNUABI: declare void @_ZGVnN4vl4l4_sincospif(<4 x float>, ptr, ptr) -; SLEEFGNUABI: declare void @_ZGVsMxvl4l4_sincospif(, ptr, ptr, ) +; SLEEFGNUABI: declare void @_ZGVsNxvl4l4_sincospif(, ptr, ptr) ; SLEEFGNUABI: declare <4 x float> @_ZGVnN4v_log10f(<4 x float>) ; SLEEFGNUABI: declare @_ZGVsMxv_log10f(, ) ; ARMPL: declare <2 x double> @armpl_vmodfq_f64(<2 x double>, ptr) -; ARMPL: declare @armpl_svmodf_f64_x(, ptr, ) ; ARMPL: declare <4 x float> @armpl_vmodfq_f32(<4 x float>, ptr) -; ARMPL: declare @armpl_svmodf_f32_x(, ptr, ) ; ARMPL: declare <2 x double> @armpl_vsinq_f64(<2 x double>) ; ARMPL: declare @armpl_svsin_f64_x(, ) ; ARMPL: declare void @armpl_vsincosq_f64(<2 x double>, ptr, ptr) -; ARMPL: declare void @armpl_svsincos_f64_x(, ptr, ptr, ) ; ARMPL: declare void @armpl_vsincosq_f32(<4 x float>, ptr, ptr) -; ARMPL: declare void @armpl_svsincos_f32_x(, ptr, ptr, ) ; ARMPL: declare void @armpl_vsincospiq_f64(<2 x double>, ptr, ptr) -; ARMPL: declare void @armpl_svsincospi_f64_x(, ptr, ptr, ) ; ARMPL: declare void @armpl_vsincospiq_f32(<4 x float>, ptr, ptr) -; ARMPL: declare void @armpl_svsincospi_f32_x(, ptr, ptr, ) ; ARMPL: declare <4 x float> @armpl_vlog10q_f32(<4 x float>) ; ARMPL: declare @armpl_svlog10_f32_x(, ) @@ -220,50 +208,44 @@ attributes #0 = { nounwind readnone } ; SLEEFGNUABI: attributes #[[MODF]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N2vl8_modf(_ZGVnN2vl8_modf), -; SLEEFGNUABI-SAME: _ZGVsMxvl8_modf(_ZGVsMxvl8_modf)" } +; SLEEFGNUABI-SAME: _ZGVsNxvl8_modf(_ZGVsNxvl8_modf)" } ; SLEEFGNUABI: attributes #[[MODFF]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N4vl4_modff(_ZGVnN4vl4_modff), -; SLEEFGNUABI-SAME: 
_ZGVsMxvl4_modff(_ZGVsMxvl4_modff)" } +; SLEEFGNUABI-SAME: _ZGVsNxvl4_modff(_ZGVsNxvl4_modff)" } ; SLEEFGNUABI: attributes #[[SIN]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N2v_sin(_ZGVnN2v_sin), ; SLEEFGNUABI-SAME: _ZGVsMxv_sin(_ZGVsMxv_sin)" } ; SLEEFGNUABI: attributes #[[SINCOS]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N2vl8l8_sincos(_ZGVnN2vl8l8_sincos), -; SLEEFGNUABI-SAME: _ZGVsMxvl8l8_sincos(_ZGVsMxvl8l8_sincos)" } +; SLEEFGNUABI-SAME: _ZGVsNxvl8l8_sincos(_ZGVsNxvl8l8_sincos)" } ; SLEEFGNUABI: attributes #[[SINCOSF]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N4vl4l4_sincosf(_ZGVnN4vl4l4_sincosf), -; SLEEFGNUABI-SAME: _ZGVsMxvl4l4_sincosf(_ZGVsMxvl4l4_sincosf)" } +; SLEEFGNUABI-SAME: _ZGVsNxvl4l4_sincosf(_ZGVsNxvl4l4_sincosf)" } ; SLEEFGNUABI: attributes #[[SINCOSPI]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N2vl8l8_sincospi(_ZGVnN2vl8l8_sincospi), -; SLEEFGNUABI-SAME: _ZGVsMxvl8l8_sincospi(_ZGVsMxvl8l8_sincospi)" } +; SLEEFGNUABI-SAME: _ZGVsNxvl8l8_sincospi(_ZGVsNxvl8l8_sincospi)" } ; SLEEFGNUABI: attributes #[[SINCOSPIF]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N4vl4l4_sincospif(_ZGVnN4vl4l4_sincospif), -; SLEEFGNUABI-SAME: _ZGVsMxvl4l4_sincospif(_ZGVsMxvl4l4_sincospif)" } +; SLEEFGNUABI-SAME: _ZGVsNxvl4l4_sincospif(_ZGVsNxvl4l4_sincospif)" } ; SLEEFGNUABI: attributes #[[LOG10]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N4v_llvm.log10.f32(_ZGVnN4v_log10f), ; SLEEFGNUABI-SAME: _ZGVsMxv_llvm.log10.f32(_ZGVsMxv_log10f)" } ; ARMPL: attributes #[[MODF]] = { "vector-function-abi-variant"= -; ARMPL-SAME: "_ZGV_LLVM_N2vl8_modf(armpl_vmodfq_f64), -; ARMPL-SAME: _ZGVsMxvl8_modf(armpl_svmodf_f64_x)" } +; ARMPL-SAME: "_ZGV_LLVM_N2vl8_modf(armpl_vmodfq_f64)" } ; ARMPL: attributes #[[MODFF]] = { "vector-function-abi-variant"= -; ARMPL-SAME: "_ZGV_LLVM_N4vl4_modff(armpl_vmodfq_f32), -; ARMPL-SAME: 
_ZGVsMxvl4_modff(armpl_svmodf_f32_x)" } +; ARMPL-SAME: "_ZGV_LLVM_N4vl4_modff(armpl_vmodfq_f32)" } ; ARMPL: attributes #[[SIN]] = { "vector-function-abi-variant"= ; ARMPL-SAME: "_ZGV_LLVM_N2v_sin(armpl_vsinq_f64), ; ARMPL-SAME _ZGVsMxv_sin(armpl_svsin_f64_x)" } ; ARMPL: attributes #[[SINCOS]] = { "vector-function-abi-variant"= -; ARMPL-SAME: "_ZGV_LLVM_N2vl8l8_sincos(armpl_vsincosq_f64), -; ARMPL-SAME: _ZGVsMxvl8l8_sincos(armpl_svsincos_f64_x)" } +; ARMPL-SAME: "_ZGV_LLVM_N2vl8l8_sincos(armpl_vsincosq_f64)" } ; ARMPL: attributes #[[SINCOSF]] = { "vector-function-abi-variant"= -; ARMPL-SAME: "_ZGV_LLVM_N4vl4l4_sincosf(armpl_vsincosq_f32), -; ARMPL-SAME: _ZGVsMxvl4l4_sincosf(armpl_svsincos_f32_x)" } +; ARMPL-SAME: "_ZGV_LLVM_N4vl4l4_sincosf(armpl_vsincosq_f32)" } ; ARMPL: attributes #[[SINCOSPI]] = { "vector-function-abi-variant"= -; ARMPL-SAME: "_ZGV_LLVM_N2vl8l8_sincospi(armpl_vsincospiq_f64), -; ARMPL-SAME: _ZGVsMxvl8l8_sincospi(armpl_svsincospi_f64_x)" } +; ARMPL-SAME: "_ZGV_LLVM_N2vl8l8_sincospi(armpl_vsincospiq_f64)" } ; ARMPL: attributes #[[SINCOSPIF]] = { "vector-function-abi-variant"= -; ARMPL-SAME: "_ZGV_LLVM_N4vl4l4_sincospif(armpl_vsincospiq_f32), -; ARMPL-SAME: _ZGVsMxvl4l4_sincospif(armpl_svsincospi_f32_x)" } +; ARMPL-SAME: "_ZGV_LLVM_N4vl4l4_sincospif(armpl_vsincospiq_f32)" } ; ARMPL: attributes #[[LOG10]] = { "vector-function-abi-variant"= ; ARMPL-SAME: "_ZGV_LLVM_N4v_llvm.log10.f32(armpl_vlog10q_f32), ; ARMPL-SAME _ZGVsMxv_llvm.log10.f32(armpl_svlog10_f32_x)" } diff --git a/llvm/test/Transforms/Util/flattencfg.ll b/llvm/test/Transforms/Util/flattencfg.ll index 4a4d4279f360d6..5f8dd981293345 100644 --- a/llvm/test/Transforms/Util/flattencfg.ll +++ b/llvm/test/Transforms/Util/flattencfg.ll @@ -77,13 +77,16 @@ define void @test_not_crash3(i32 %a) #0 { ; CHECK-SAME: (i32 [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A_EQ_0:%.*]] = icmp eq i32 [[A]], 0 +; CHECK-NEXT: br i1 [[A_EQ_0]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: bb0: +; CHECK-NEXT: 
br label [[BB1]] +; CHECK: bb1: ; CHECK-NEXT: [[A_EQ_1:%.*]] = icmp eq i32 [[A]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = or i1 [[A_EQ_0]], [[A_EQ_1]] -; CHECK-NEXT: br i1 [[TMP0]], label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK-NEXT: br i1 [[A_EQ_1]], label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[CHECK_BADREF:%.*]] = phi i32 [ 17, [[ENTRY:%.*]] ], [ 11, [[BB2]] ] +; CHECK-NEXT: [[CHECK_BADREF:%.*]] = phi i32 [ 17, [[BB1]] ], [ 11, [[BB2]] ] ; CHECK-NEXT: ret void ; entry: @@ -278,9 +281,9 @@ define i1 @test_cond_multi_use(i32 %x, i32 %y, i32 %z) { ; CHECK-NEXT: entry.x: ; CHECK-NEXT: [[CMP_X:%.*]] = icmp ne i32 [[X]], 0 ; CHECK-NEXT: [[CMP_Y:%.*]] = icmp eq i32 [[Y]], 0 -; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[CMP_Y]], true -; CHECK-NEXT: [[TMP1:%.*]] = or i1 [[CMP_X]], [[TMP0]] -; CHECK-NEXT: br i1 [[TMP1]], label [[IF_THEN_Y:%.*]], label [[EXIT:%.*]] +; CHECK-NEXT: [[CMP_Y_NOT:%.*]] = xor i1 [[CMP_Y]], true +; CHECK-NEXT: [[TMP0:%.*]] = or i1 [[CMP_X]], [[CMP_Y_NOT]] +; CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN_Y:%.*]], label [[EXIT:%.*]] ; CHECK: if.then.y: ; CHECK-NEXT: store i32 [[Z]], ptr @g, align 4 ; CHECK-NEXT: br label [[EXIT]] diff --git a/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll b/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll new file mode 100644 index 00000000000000..0a43ad2f9a3684 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=vector-combine -S -mtriple=riscv32 -mattr=+v | FileCheck %s +; RUN: opt < %s -passes=vector-combine -S -mtriple=riscv64 -mattr=+v | FileCheck %s + +define void @fixed_load_scalable_src(ptr %p) { +; CHECK-LABEL: define void @fixed_load_scalable_src( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 8 
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: ret void +; +entry: + store zeroinitializer, ptr %p + %0 = load <4 x i16>, ptr %p + %1 = shufflevector <4 x i16> %0, <4 x i16> zeroinitializer, <8 x i32> + ret void +} diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index c6f9ee82e08cc1..74e7769a480598 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -414,10 +414,11 @@ def version_int(ver): config.available_features.add("llvm-dylib") config.substitutions.append( ( + # libLLVM.so.19.0git "%llvmdylib", - "{}/libLLVM-{}{}".format( - config.llvm_shlib_dir, config.llvm_dylib_version, config.llvm_shlib_ext - ), + "{}/libLLVM{}.{}".format( + config.llvm_shlib_dir, config.llvm_shlib_ext, config.llvm_dylib_version + ) ) ) diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in index 1138b2ccf7bce7..b6f255d472d16f 100644 --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -44,7 +44,7 @@ config.build_examples = @LLVM_BUILD_EXAMPLES@ config.enable_threads = @LLVM_ENABLE_THREADS@ config.build_shared_libs = @BUILD_SHARED_LIBS@ config.link_llvm_dylib = @LLVM_LINK_LLVM_DYLIB@ -config.llvm_dylib_version = "@LLVM_VERSION_MAJOR@@LLVM_VERSION_SUFFIX@" +config.llvm_dylib_version = "@LLVM_VERSION_MAJOR@.@LLVM_VERSION_MINOR@@LLVM_VERSION_SUFFIX@" config.llvm_host_triple = '@LLVM_HOST_TRIPLE@' config.host_arch = "@HOST_ARCH@" config.have_opt_viewer_modules = @LLVM_HAVE_OPT_VIEWER_MODULES@ diff --git a/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.c b/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.c new file mode 100644 index 00000000000000..bd2b979bd257f7 --- /dev/null +++ b/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.c @@ -0,0 +1,20 @@ +#define C c +#define D 1 +#define E (C != a) && (C > a) +#define F E + +void __attribute__((noinline)) func1(void) { return; } + +void __attribute__((noinline)) func(int a, 
int b, int c) { + if (a && D && E || b) + func1(); + if (b && D) + func1(); + if (a && (b && C) || (D && F)) + func1(); +} + +int main() { + func(2, 3, 3); + return 0; +} diff --git a/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.o b/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.o new file mode 100644 index 00000000000000..667ccd132d2fb8 Binary files /dev/null and b/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.o differ diff --git a/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.proftext b/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.proftext new file mode 100644 index 00000000000000..35ecc42b5802a6 --- /dev/null +++ b/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.proftext @@ -0,0 +1,62 @@ +func +# Func Hash: +395201011017399473 +# Num Counters: +22 +# Counter Values: +1 +1 +0 +0 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +0 +1 +1 +1 +0 +0 +0 +0 +# Num Bitmap Bytes: +$13 +# Bitmap Byte Values: +0x0 +0x0 +0x0 +0x20 +0x8 +0x0 +0x20 +0x0 +0x0 +0x0 +0x0 +0x0 +0x0 + + +func1 +# Func Hash: +24 +# Num Counters: +1 +# Counter Values: +3 + +main +# Func Hash: +24 +# Num Counters: +1 +# Counter Values: +1 + diff --git a/llvm/test/tools/llvm-cov/mcdc-general-none.test b/llvm/test/tools/llvm-cov/mcdc-general-none.test index bcf8f3cbd05d45..a373075cc5e37c 100644 --- a/llvm/test/tools/llvm-cov/mcdc-general-none.test +++ b/llvm/test/tools/llvm-cov/mcdc-general-none.test @@ -52,7 +52,7 @@ // Test html output. 
// RUN: llvm-cov show --show-mcdc-summary --show-mcdc %S/Inputs/mcdc-general.o -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs %S/Inputs/mcdc-general.cpp -format html -o %t.html.dir // RUN: FileCheck -check-prefix=HTML -input-file=%t.html.dir/coverage/tmp/mcdc-general.cpp.html %s -// HTML-COUNT-4: MC/DC Decision Region ( +// HTML-COUNT-4: MC/DC Decision Region ( // RUN: FileCheck -check-prefix HTML-INDEX -input-file %t.html.dir/index.html %s // HTML-INDEX-LABEL: diff --git a/llvm/test/tools/llvm-cov/mcdc-general.test b/llvm/test/tools/llvm-cov/mcdc-general.test index 588aed09c16a5e..ded2f3eb1c9a5d 100644 --- a/llvm/test/tools/llvm-cov/mcdc-general.test +++ b/llvm/test/tools/llvm-cov/mcdc-general.test @@ -118,7 +118,7 @@ // Test html output. // RUN: llvm-cov show --show-mcdc-summary --show-mcdc %S/Inputs/mcdc-general.o -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs %S/Inputs/mcdc-general.cpp -format html -o %t.html.dir // RUN: FileCheck -check-prefix=HTML -input-file=%t.html.dir/coverage/tmp/mcdc-general.cpp.html %s -// HTML-COUNT-4: MC/DC Decision Region ( +// HTML-COUNT-4: MC/DC Decision Region ( // RUN: FileCheck -check-prefix HTML-INDEX -input-file %t.html.dir/index.html %s // HTML-INDEX-LABEL:
diff --git a/llvm/test/tools/llvm-cov/mcdc-macro.test b/llvm/test/tools/llvm-cov/mcdc-macro.test new file mode 100644 index 00000000000000..339284bba2c9bd --- /dev/null +++ b/llvm/test/tools/llvm-cov/mcdc-macro.test @@ -0,0 +1,99 @@ +// Test visualization of MC/DC constructs for branches in macro expansions. + +// RUN: llvm-profdata merge %S/Inputs/mcdc-macro.proftext -o %t.profdata +// RUN: llvm-cov show --show-expansions --show-branches=count --show-mcdc %S/Inputs/mcdc-macro.o -instr-profile %t.profdata --compilation-dir=%S/Inputs | FileCheck %s + +// CHECK: | | | Branch (2:11): [Folded - Ignored] +// CHECK: | | | Branch (3:11): [True: 1, False: 0] +// CHECK: | | | Branch (3:23): [True: 1, False: 0] +// CHECK: | Branch (9:7): [True: 1, False: 0] +// CHECK-NEXT: | Branch (9:22): [True: 0, False: 0] +// CHECK-NEXT: ------------------ +// CHECK-NEXT: |---> MC/DC Decision Region (9:7) to (9:23) +// CHECK-NEXT: | +// CHECK-NEXT: | Number of Conditions: 5 +// CHECK-NEXT: | Condition C1 --> (9:7) +// CHECK-NEXT: | Condition C2 --> (9:22) +// CHECK-NEXT: | Condition C3 --> (2:11) +// CHECK-NEXT: | Condition C4 --> (3:11) +// CHECK-NEXT: | Condition C5 --> (3:23) +// CHECK-NEXT: | +// CHECK-NEXT: | Executed MC/DC Test Vectors: +// CHECK-NEXT: | +// CHECK-NEXT: | C1, C2, C3, C4, C5 Result +// CHECK-NEXT: | 1 { T, -, C, T, T = T } +// CHECK-NEXT: | +// CHECK-NEXT: | C1-Pair: not covered +// CHECK-NEXT: | C2-Pair: not covered +// CHECK-NEXT: | C3-Pair: constant folded +// CHECK-NEXT: | C4-Pair: not covered +// CHECK-NEXT: | C5-Pair: not covered +// CHECK-NEXT: | MC/DC Coverage for Decision: 0.00% +// CHECK-NEXT: | +// CHECK-NEXT: ------------------ + +// CHECK: | | | Branch (2:11): [Folded - Ignored] +// CHECK: | Branch (11:7): [True: 1, False: 0] +// CHECK-NEXT: ------------------ +// CHECK-NEXT: |---> MC/DC Decision Region (11:7) to (11:13) +// CHECK-NEXT: | +// CHECK-NEXT: | Number of Conditions: 2 +// CHECK-NEXT: | Condition C1 --> (11:7) +// CHECK-NEXT: | Condition C2 
--> (2:11) +// CHECK-NEXT: | +// CHECK-NEXT: | Executed MC/DC Test Vectors: +// CHECK-NEXT: | +// CHECK-NEXT: | C1, C2 Result +// CHECK-NEXT: | 1 { T, C = T } +// CHECK-NEXT: | +// CHECK-NEXT: | C1-Pair: not covered +// CHECK-NEXT: | C2-Pair: constant folded +// CHECK-NEXT: | MC/DC Coverage for Decision: 0.00% +// CHECK-NEXT: | +// CHECK-NEXT: ------------------ + +// CHECK: | | | Branch (1:11): [True: 1, False: 0] +// CHECK: | | | Branch (2:11): [Folded - Ignored] +// CHECK: | | | | | Branch (3:11): [True: 0, False: 0] +// CHECK: | | | | | Branch (3:23): [True: 0, False: 0] +// CHECK: | Branch (13:7): [True: 1, False: 0] +// CHECK-NEXT: | Branch (13:13): [True: 1, False: 0] +// CHECK-NEXT: ------------------ +// CHECK-NEXT: |---> MC/DC Decision Region (13:7) to (13:32) +// CHECK-NEXT: | +// CHECK-NEXT: | Number of Conditions: 6 +// CHECK-NEXT: | Condition C1 --> (13:7) +// CHECK-NEXT: | Condition C2 --> (13:13) +// CHECK-NEXT: | Condition C3 --> (1:11) +// CHECK-NEXT: | Condition C4 --> (2:11) +// CHECK-NEXT: | Condition C5 --> (3:11) +// CHECK-NEXT: | Condition C6 --> (3:23) +// CHECK-NEXT: | +// CHECK-NEXT: | Executed MC/DC Test Vectors: +// CHECK-NEXT: | +// CHECK-NEXT: | C1, C2, C3, C4, C5, C6 Result +// CHECK-NEXT: | 1 { T, T, T, C, -, - = T } +// CHECK-NEXT: | +// CHECK-NEXT: | C1-Pair: not covered +// CHECK-NEXT: | C2-Pair: not covered +// CHECK-NEXT: | C3-Pair: not covered +// CHECK-NEXT: | C4-Pair: constant folded +// CHECK-NEXT: | C5-Pair: not covered +// CHECK-NEXT: | C6-Pair: not covered +// CHECK-NEXT: | MC/DC Coverage for Decision: 0.00% +// CHECK-NEXT: | +// CHECK-NEXT: ------------------ + +Instructions for regenerating the test: + +cd %S/Inputs # Or copy mcdc-macro.c into the working directory + +clang -fcoverage-mcdc -fprofile-instr-generate -fcoverage-compilation-dir=. 
\ + -O3 -mllvm -enable-name-compression=false \ + -fcoverage-mapping mcdc-macro.c -c + +# Instructions for generating proftext +clang -fprofile-instr-generate mcdc-macro.o +./a.out +llvm-profdata merge --sparse -o default.profdata default.profraw +llvm-profdata merge --text -o mcdc-macro.proftext default.profdata diff --git a/llvm/test/tools/llvm-dlltool/coff-decorated.def b/llvm/test/tools/llvm-dlltool/coff-decorated.def index 856804686168b1..fc81f23d09d6c4 100644 --- a/llvm/test/tools/llvm-dlltool/coff-decorated.def +++ b/llvm/test/tools/llvm-dlltool/coff-decorated.def @@ -14,25 +14,32 @@ OtherStdcallExportName@4=CdeclInternalFunction CdeclExportName=StdcallInternalFunction@4 ; CHECK: Name type: noprefix +; CHECK-NEXT: Export name: CdeclFunction ; CHECK-NEXT: Symbol: __imp__CdeclFunction ; CHECK-NEXT: Symbol: _CdeclFunction ; CHECK: Name type: undecorate +; CHECK-NEXT: Export name: StdcallFunction ; CHECK-NEXT: Symbol: __imp__StdcallFunction@4 ; CHECK-NEXT: Symbol: _StdcallFunction@4 ; CHECK: Name type: undecorate +; CHECK-NEXT: Export name: FastcallFunction ; CHECK-NEXT: Symbol: __imp_@FastcallFunction@4 ; CHECK-NEXT: Symbol: @FastcallFunction@4 ; CHECK: Name type: name +; CHECK-NEXT: Export name: ??_7exception@@6B@ ; CHECK-NEXT: Symbol: __imp_??_7exception@@6B@ ; CHECK-NEXT: Symbol: ??_7exception@@6B@ ; CHECK-NM: W _StdcallAlias@4 ; CHECK-NM: U _StdcallFunction@4 ; CHECK: Name type: undecorate +; CHECK-NEXT: Export name: StdcallExportName ; CHECK-NEXT: Symbol: __imp__StdcallExportName@4{{$}} ; CHECK-NEXT: Symbol: _StdcallExportName@4{{$}} ; CHECK: Name type: undecorate +; CHECK-NEXT: Export name: OtherStdcallExportName ; CHECK-NEXT: Symbol: __imp__OtherStdcallExportName@4{{$}} ; CHECK-NEXT: Symbol: _OtherStdcallExportName@4{{$}} ; CHECK: Name type: noprefix +; CHECK-NEXT: Export name: CdeclExportName ; CHECK-NEXT: Symbol: __imp__CdeclExportName ; CHECK-NEXT: Symbol: _CdeclExportName diff --git a/llvm/test/tools/llvm-dlltool/coff-exports.def 
b/llvm/test/tools/llvm-dlltool/coff-exports.def index 57c55744602156..267424db1b8c1d 100644 --- a/llvm/test/tools/llvm-dlltool/coff-exports.def +++ b/llvm/test/tools/llvm-dlltool/coff-exports.def @@ -17,12 +17,15 @@ AnotherFunction ; CHECK-ARM64: Format: COFF-import-file-ARM64 ; CHECK: Type: code ; CHECK: Name type: name +; CHECK-NEXT: Export name: TestFunction1 ; CHECK-NEXT: Symbol: __imp_TestFunction1 ; CHECK-NEXT: Symbol: TestFunction1 ; CHECK: Name type: name +; CHECK-NEXT: Export name: TestFunction2 ; CHECK-NEXT: Symbol: __imp_TestFunction2{{$}} ; CHECK-NEXT: Symbol: TestFunction2{{$}} ; CHECK: Name type: name +; CHECK-NEXT: Export name: TestFunction3 ; CHECK-NEXT: Symbol: __imp_TestFunction3{{$}} ; CHECK-NEXT: Symbol: TestFunction3{{$}} diff --git a/llvm/test/tools/llvm-dlltool/coff-noname.def b/llvm/test/tools/llvm-dlltool/coff-noname.def index 27e60efbd2d802..7cb05846ce28a2 100644 --- a/llvm/test/tools/llvm-dlltool/coff-noname.def +++ b/llvm/test/tools/llvm-dlltool/coff-noname.def @@ -12,5 +12,6 @@ ByNameFunction ; CHECK-NEXT: Symbol: __imp__ByOrdinalFunction ; CHECK-NEXT: Symbol: _ByOrdinalFunction ; CHECK: Name type: noprefix +; CHECK-NEXT: Export name: ByNameFunction ; CHECK-NEXT: Symbol: __imp__ByNameFunction ; CHECK-NEXT: Symbol: _ByNameFunction diff --git a/llvm/test/tools/llvm-dlltool/no-leading-underscore.def b/llvm/test/tools/llvm-dlltool/no-leading-underscore.def index 6b78e15d2b5f69..9c5e77ca29a821 100644 --- a/llvm/test/tools/llvm-dlltool/no-leading-underscore.def +++ b/llvm/test/tools/llvm-dlltool/no-leading-underscore.def @@ -9,9 +9,11 @@ alias == func DecoratedFunction@4 ; CHECK: Name type: name +; CHECK-NEXT: Export name: func ; CHECK-NEXT: Symbol: __imp_func ; CHECK-NEXT: Symbol: func ; CHECK: Name type: undecorate +; CHECK-NEXT: Export name: DecoratedFunction ; CHECK-NEXT: Symbol: __imp_DecoratedFunction@4 ; CHECK-NEXT: Symbol: DecoratedFunction@4 diff --git a/llvm/test/tools/llvm-lib/arm64ec-implib.test 
b/llvm/test/tools/llvm-lib/arm64ec-implib.test index 2672f8d38b7f70..ebc1b166ee4ea6 100644 --- a/llvm/test/tools/llvm-lib/arm64ec-implib.test +++ b/llvm/test/tools/llvm-lib/arm64ec-implib.test @@ -11,9 +11,26 @@ ARMAP-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll ARMAP-EMPTY: ARMAP-NEXT: Archive EC map +ARMAP-NEXT: #expname in test.dll +ARMAP-NEXT: #funcexp in test.dll +ARMAP-NEXT: #mangledfunc in test.dll +ARMAP-NEXT: ?test_cpp_func@@$$hYAHPEAX@Z in test.dll +ARMAP-NEXT: ?test_cpp_func@@YAHPEAX@Z in test.dll +ARMAP-NEXT: __IMPORT_DESCRIPTOR_test in test.dll +ARMAP-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll +ARMAP-NEXT: __imp_?test_cpp_func@@YAHPEAX@Z in test.dll +ARMAP-NEXT: __imp_aux_?test_cpp_func@@YAHPEAX@Z in test.dll +ARMAP-NEXT: __imp_aux_expname in test.dll +ARMAP-NEXT: __imp_aux_funcexp in test.dll +ARMAP-NEXT: __imp_aux_mangledfunc in test.dll ARMAP-NEXT: __imp_dataexp in test.dll +ARMAP-NEXT: __imp_expname in test.dll ARMAP-NEXT: __imp_funcexp in test.dll +ARMAP-NEXT: __imp_mangledfunc in test.dll +ARMAP-NEXT: expname in test.dll ARMAP-NEXT: funcexp in test.dll +ARMAP-NEXT: mangledfunc in test.dll +ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll RUN: llvm-readobj test.lib | FileCheck -check-prefix=READOBJ %s @@ -35,14 +52,48 @@ READOBJ-EMPTY: READOBJ-NEXT: File: test.dll READOBJ-NEXT: Format: COFF-import-file-ARM64EC READOBJ-NEXT: Type: code -READOBJ-NEXT: Name type: name +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: funcexp READOBJ-NEXT: Symbol: __imp_funcexp READOBJ-NEXT: Symbol: funcexp +READOBJ-NEXT: Symbol: __imp_aux_funcexp +READOBJ-NEXT: Symbol: #funcexp +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-ARM64EC +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: mangledfunc +READOBJ-NEXT: Symbol: __imp_mangledfunc +READOBJ-NEXT: Symbol: mangledfunc +READOBJ-NEXT: Symbol: __imp_aux_mangledfunc +READOBJ-NEXT: Symbol: 
#mangledfunc +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-ARM64EC +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: ?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: __imp_?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: ?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: __imp_aux_?test_cpp_func@@YAHPEAX@Z +READOBJ-NEXT: Symbol: ?test_cpp_func@@$$hYAHPEAX@Z +READOBJ-EMPTY: +READOBJ-NEXT: File: test.dll +READOBJ-NEXT: Format: COFF-import-file-ARM64EC +READOBJ-NEXT: Type: code +READOBJ-NEXT: Name type: export as +READOBJ-NEXT: Export name: expname +READOBJ-NEXT: Symbol: __imp_expname +READOBJ-NEXT: Symbol: expname +READOBJ-NEXT: Symbol: __imp_aux_expname +READOBJ-NEXT: Symbol: #expname READOBJ-EMPTY: READOBJ-NEXT: File: test.dll READOBJ-NEXT: Format: COFF-import-file-ARM64EC READOBJ-NEXT: Type: data READOBJ-NEXT: Name type: name +READOBJ-NEXT: Export name: dataexp READOBJ-NEXT: Symbol: __imp_dataexp Creating a new lib containing the existing lib: @@ -53,4 +104,7 @@ RUN: llvm-nm --print-armap test2.lib | FileCheck -check-prefix=ARMAP %s LIBRARY test.dll EXPORTS funcexp + #mangledfunc + ?test_cpp_func@@YAHPEAX@Z + expname=impname dataexp DATA diff --git a/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/basic-instructions.s new file mode 100644 index 00000000000000..7dd05eb50085c8 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/basic-instructions.s @@ -0,0 +1,3724 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=ampere1b -instruction-tables < %s | FileCheck %s + +#------------------------------------------------------------------------------ +# Add/sub (immediate) +#------------------------------------------------------------------------------ + +add w2, w3, #4095 +add w30, w29, #1, lsl #12 +add w13, w5, #4095, lsl #12 +add x5, 
x7, #1638 +add w20, wsp, #801 +add wsp, wsp, #1104 +add wsp, w30, #4084 +add x0, x24, #291 +add x3, x24, #4095, lsl #12 +add x8, sp, #1074 +add sp, x29, #3816 +sub w0, wsp, #4077 +sub w4, w20, #546, lsl #12 +sub sp, sp, #288 +sub wsp, w19, #16 +adds w13, w23, #291, lsl #12 +cmn w2, #4095 +adds w20, wsp, #0 +cmn x3, #1, lsl #12 +cmp sp, #20, lsl #12 +cmp x30, #4095 +subs x4, sp, #3822 +cmn w3, #291, lsl #12 +cmn wsp, #1365 +cmn sp, #1092, lsl #12 +mov sp, x30 +mov wsp, w20 +mov x11, sp +mov w24, wsp + +#------------------------------------------------------------------------------ +# Add-subtract (shifted register) +#------------------------------------------------------------------------------ + +add w3, w5, w7 +add wzr, w3, w5 +add w20, wzr, w4 +add w4, w6, wzr +add w11, w13, w15 +add w9, w3, wzr, lsl #10 +add w17, w29, w20, lsl #31 +add w21, w22, w23, lsr #0 +add w24, w25, w26, lsr #18 +add w27, w28, w29, lsr #31 +add w2, w3, w4, asr #0 +add w5, w6, w7, asr #21 +add w8, w9, w10, asr #31 +add x3, x5, x7 +add xzr, x3, x5 +add x20, xzr, x4 +add x4, x6, xzr +add x11, x13, x15 +add x9, x3, xzr, lsl #10 +add x17, x29, x20, lsl #63 +add x21, x22, x23, lsr #0 +add x24, x25, x26, lsr #18 +add x27, x28, x29, lsr #63 +add x2, x3, x4, asr #0 +add x5, x6, x7, asr #21 +add x8, x9, x10, asr #63 +adds w3, w5, w7 +cmn w3, w5 +adds w20, wzr, w4 +adds w4, w6, wzr +adds w11, w13, w15 +adds w9, w3, wzr, lsl #10 +adds w17, w29, w20, lsl #31 +adds w21, w22, w23, lsr #0 +adds w24, w25, w26, lsr #18 +adds w27, w28, w29, lsr #31 +adds w2, w3, w4, asr #0 +adds w5, w6, w7, asr #21 +adds w8, w9, w10, asr #31 +adds x3, x5, x7 +cmn x3, x5 +adds x20, xzr, x4 +adds x4, x6, xzr +adds x11, x13, x15 +adds x9, x3, xzr, lsl #10 +adds x17, x29, x20, lsl #63 +adds x21, x22, x23, lsr #0 +adds x24, x25, x26, lsr #18 +adds x27, x28, x29, lsr #63 +adds x2, x3, x4, asr #0 +adds x5, x6, x7, asr #21 +adds x8, x9, x10, asr #63 +sub w3, w5, w7 +sub wzr, w3, w5 +sub w4, w6, wzr +sub w11, w13, w15 +sub w9, w3, 
wzr, lsl #10 +sub w17, w29, w20, lsl #31 +sub w21, w22, w23, lsr #0 +sub w24, w25, w26, lsr #18 +sub w27, w28, w29, lsr #31 +sub w2, w3, w4, asr #0 +sub w5, w6, w7, asr #21 +sub w8, w9, w10, asr #31 +sub x3, x5, x7 +sub xzr, x3, x5 +sub x4, x6, xzr +sub x11, x13, x15 +sub x9, x3, xzr, lsl #10 +sub x17, x29, x20, lsl #63 +sub x21, x22, x23, lsr #0 +sub x24, x25, x26, lsr #18 +sub x27, x28, x29, lsr #63 +sub x2, x3, x4, asr #0 +sub x5, x6, x7, asr #21 +sub x8, x9, x10, asr #63 +subs w3, w5, w7 +cmp w3, w5 +subs w4, w6, wzr +subs w11, w13, w15 +subs w9, w3, wzr, lsl #10 +subs w17, w29, w20, lsl #31 +subs w21, w22, w23, lsr #0 +subs w24, w25, w26, lsr #18 +subs w27, w28, w29, lsr #31 +subs w2, w3, w4, asr #0 +subs w5, w6, w7, asr #21 +subs w8, w9, w10, asr #31 +subs x3, x5, x7 +cmp x3, x5 +subs x4, x6, xzr +subs x11, x13, x15 +subs x9, x3, xzr, lsl #10 +subs x17, x29, x20, lsl #63 +subs x21, x22, x23, lsr #0 +subs x24, x25, x26, lsr #18 +subs x27, x28, x29, lsr #63 +subs x2, x3, x4, asr #0 +subs x5, x6, x7, asr #21 +subs x8, x9, x10, asr #63 +cmn wzr, w4 +cmn w5, wzr +cmn w6, w7 +cmn w8, w9, lsl #15 +cmn w10, w11, lsl #31 +cmn w12, w13, lsr #0 +cmn w14, w15, lsr #21 +cmn w16, w17, lsr #31 +cmn w18, w19, asr #0 +cmn w20, w21, asr #22 +cmn w22, w23, asr #31 +cmn x0, x3 +cmn xzr, x4 +cmn x5, xzr +cmn x6, x7 +cmn x8, x9, lsl #15 +cmn x10, x11, lsl #63 +cmn x12, x13, lsr #0 +cmn x14, x15, lsr #41 +cmn x16, x17, lsr #63 +cmn x18, x19, asr #0 +cmn x20, x21, asr #55 +cmn x22, x23, asr #63 +cmp w0, w3 +cmp wzr, w4 +cmp w5, wzr +cmp w6, w7 +cmp w8, w9, lsl #15 +cmp w10, w11, lsl #31 +cmp w12, w13, lsr #0 +cmp w14, w15, lsr #21 +cmp w18, w19, asr #0 +cmp w20, w21, asr #22 +cmp w22, w23, asr #31 +cmp x0, x3 +cmp xzr, x4 +cmp x5, xzr +cmp x6, x7 +cmp x8, x9, lsl #15 +cmp x10, x11, lsl #63 +cmp x12, x13, lsr #0 +cmp x14, x15, lsr #41 +cmp x16, x17, lsr #63 +cmp x18, x19, asr #0 +cmp x20, x21, asr #55 +cmp x22, x23, asr #63 +cmp wzr, w0 +cmp xzr, x0 + 
+#------------------------------------------------------------------------------ +# Add-subtract (shifted register) +#------------------------------------------------------------------------------ + +adc w29, w27, w25 +adc wzr, w3, w4 +adc w9, wzr, w10 +adc w20, w0, wzr +adc x29, x27, x25 +adc xzr, x3, x4 +adc x9, xzr, x10 +adc x20, x0, xzr +adcs w29, w27, w25 +adcs wzr, w3, w4 +adcs w9, wzr, w10 +adcs w20, w0, wzr +adcs x29, x27, x25 +adcs xzr, x3, x4 +adcs x9, xzr, x10 +adcs x20, x0, xzr +sbc w29, w27, w25 +sbc wzr, w3, w4 +ngc w9, w10 +sbc w20, w0, wzr +sbc x29, x27, x25 +sbc xzr, x3, x4 +ngc x9, x10 +sbc x20, x0, xzr +sbcs w29, w27, w25 +sbcs wzr, w3, w4 +ngcs w9, w10 +sbcs w20, w0, wzr +sbcs x29, x27, x25 +sbcs xzr, x3, x4 +ngcs x9, x10 +sbcs x20, x0, xzr +ngc w3, w12 +ngc wzr, w9 +ngc w23, wzr +ngc x29, x30 +ngc xzr, x0 +ngc x0, xzr +ngcs w3, w12 +ngcs wzr, w9 +ngcs w23, wzr +ngcs x29, x30 +ngcs xzr, x0 +ngcs x0, xzr + +#------------------------------------------------------------------------------ +# Compare and branch (immediate) +#------------------------------------------------------------------------------ + +sbfx x1, x2, #3, #2 +asr x3, x4, #63 +asr wzr, wzr, #31 +sbfx w12, w9, #0, #1 +ubfiz x4, x5, #52, #11 +ubfx xzr, x4, #0, #1 +ubfiz x4, xzr, #1, #6 +lsr x5, x6, #12 +bfi x4, x5, #52, #11 +bfxil xzr, x4, #0, #1 +bfi x4, xzr, #1, #6 +bfxil x5, x6, #12, #52 +sxtb w1, w2 +sxtb xzr, w3 +sxth w9, w10 +sxth x0, w1 +sxtw x3, w30 +uxtb w1, w2 +uxth w9, w10 +ubfx x3, x30, #0, #32 +asr w3, w2, #0 +asr w9, w10, #31 +asr x20, x21, #63 +asr w1, wzr, #3 +lsr w3, w2, #0 +lsr w9, w10, #31 +lsr x20, x21, #63 +lsr wzr, wzr, #3 +lsr w3, w2, #0 +lsl w9, w10, #31 +lsl x20, x21, #63 +lsl w1, wzr, #3 +sbfx w9, w10, #0, #1 +sbfiz x2, x3, #63, #1 +asr x19, x20, #0 +sbfiz x9, x10, #5, #59 +asr w9, w10, #0 +sbfiz w11, w12, #31, #1 +sbfiz w13, w14, #29, #3 +sbfiz xzr, xzr, #10, #11 +sbfx w9, w10, #0, #1 +asr x2, x3, #63 +asr x19, x20, #0 +asr x9, x10, #5 +asr w9, w10, #0 +asr 
w11, w12, #31 +asr w13, w14, #29 +sbfx xzr, xzr, #10, #11 +bfxil w9, w10, #0, #1 +bfi x2, x3, #63, #1 +bfxil x19, x20, #0, #64 +bfi x9, x10, #5, #59 +bfxil w9, w10, #0, #32 +bfi w11, w12, #31, #1 +bfi w13, w14, #29, #3 +bfi xzr, xzr, #10, #11 +bfxil w9, w10, #0, #1 +bfxil x2, x3, #63, #1 +bfxil x19, x20, #0, #64 +bfxil x9, x10, #5, #59 +bfxil w9, w10, #0, #32 +bfxil w11, w12, #31, #1 +bfxil w13, w14, #29, #3 +bfxil xzr, xzr, #10, #11 +ubfx w9, w10, #0, #1 +lsl x2, x3, #63 +lsr x19, x20, #0 +lsl x9, x10, #5 +lsr w9, w10, #0 +lsl w11, w12, #31 +lsl w13, w14, #29 +ubfiz xzr, xzr, #10, #11 +ubfx w9, w10, #0, #1 +lsr x2, x3, #63 +lsr x19, x20, #0 +lsr x9, x10, #5 +lsr w9, w10, #0 +lsr w11, w12, #31 +lsr w13, w14, #29 +ubfx xzr, xzr, #10, #11 + +#------------------------------------------------------------------------------ +# Compare and branch (immediate) +#------------------------------------------------------------------------------ + +cbz w5, #4 +cbz x5, #0 +cbnz x2, #-4 +cbnz x26, #1048572 +cbz wzr, #0 +cbnz xzr, #0 + +#------------------------------------------------------------------------------ +# Conditional branch (immediate) +#------------------------------------------------------------------------------ + +b.ne #4 +b.ge #1048572 +b.ge #-4 + +#------------------------------------------------------------------------------ +# Conditional compare (immediate) +#------------------------------------------------------------------------------ + +ccmp w1, #31, #0, eq +ccmp w3, #0, #15, hs +ccmp wzr, #15, #13, hs +ccmp x9, #31, #0, le +ccmp x3, #0, #15, gt +ccmp xzr, #5, #7, ne +ccmn w1, #31, #0, eq +ccmn w3, #0, #15, hs +ccmn wzr, #15, #13, hs +ccmn x9, #31, #0, le +ccmn x3, #0, #15, gt +ccmn xzr, #5, #7, ne + +#------------------------------------------------------------------------------ +# Conditional compare (register) +#------------------------------------------------------------------------------ + +ccmp w1, wzr, #0, eq +ccmp w3, w0, #15, hs +ccmp wzr, w15, #13, 
hs +ccmp x9, xzr, #0, le +ccmp x3, x0, #15, gt +ccmp xzr, x5, #7, ne +ccmn w1, wzr, #0, eq +ccmn w3, w0, #15, hs +ccmn wzr, w15, #13, hs +ccmn x9, xzr, #0, le +ccmn x3, x0, #15, gt +ccmn xzr, x5, #7, ne + +#------------------------------------------------------------------------------ +# Conditional branch (immediate) +#------------------------------------------------------------------------------ + +csel w1, w0, w19, ne +csel wzr, w5, w9, eq +csel w9, wzr, w30, gt +csel w1, w28, wzr, mi +csel x19, x23, x29, lt +csel xzr, x3, x4, ge +csel x5, xzr, x6, hs +csel x7, x8, xzr, lo +csinc w1, w0, w19, ne +csinc wzr, w5, w9, eq +csinc w9, wzr, w30, gt +csinc w1, w28, wzr, mi +csinc x19, x23, x29, lt +csinc xzr, x3, x4, ge +csinc x5, xzr, x6, hs +csinc x7, x8, xzr, lo +csinv w1, w0, w19, ne +csinv wzr, w5, w9, eq +csinv w9, wzr, w30, gt +csinv w1, w28, wzr, mi +csinv x19, x23, x29, lt +csinv xzr, x3, x4, ge +csinv x5, xzr, x6, hs +csinv x7, x8, xzr, lo +csneg w1, w0, w19, ne +csneg wzr, w5, w9, eq +csneg w9, wzr, w30, gt +csneg w1, w28, wzr, mi +csneg x19, x23, x29, lt +csneg xzr, x3, x4, ge +csneg x5, xzr, x6, hs +csneg x7, x8, xzr, lo +cset w3, eq +cset x9, pl +csetm w20, ne +csetm x30, ge +csinc w2, wzr, wzr, al +csinv x3, xzr, xzr, nv +cinc w3, w5, gt +cinc wzr, w4, le +cset w9, lt +cinc x3, x5, gt +cinc xzr, x4, le +cset x9, lt +csinc w5, w6, w6, nv +csinc x1, x2, x2, al +cinv w3, w5, gt +cinv wzr, w4, le +csetm w9, lt +cinv x3, x5, gt +cinv xzr, x4, le +csetm x9, lt +csinv x1, x0, x0, al +csinv w9, w8, w8, nv +cneg w3, w5, gt +cneg wzr, w4, le +cneg w9, wzr, lt +cneg x3, x5, gt +cneg xzr, x4, le +cneg x9, xzr, lt +csneg x4, x8, x8, al +csinv w9, w8, w8, nv + +#------------------------------------------------------------------------------ +# Data-processing (1 source) +#------------------------------------------------------------------------------ + +rbit w0, w7 +rbit x18, x3 +rev16 w17, w1 +rev16 x5, x2 +rev w18, w0 +rev32 x20, x1 +rev x22, x2 +clz w24, w3 +clz x26, 
x4 +cls w3, w5 +cls x20, x5 + +#------------------------------------------------------------------------------ +# Data-processing (2 source) +#------------------------------------------------------------------------------ + +udiv w0, w7, w10 +udiv x9, x22, x4 +sdiv w12, w21, w0 +sdiv x13, x2, x1 +lsl w11, w12, w13 +lsl x14, x15, x16 +lsr w17, w18, w19 +lsr x20, x21, x22 +asr w23, w24, w25 +asr x26, x27, x28 +ror w0, w1, w2 +ror x3, x4, x5 +lsl w6, w7, w8 +lsl x9, x10, x11 +lsr w12, w13, w14 +lsr x15, x16, x17 +asr w18, w19, w20 +asr x21, x22, x23 +ror w24, w25, w26 +ror x27, x28, x29 + +#------------------------------------------------------------------------------ +# Data-processing (3 sources) +#------------------------------------------------------------------------------ + +smulh x30, x29, x28 +smulh xzr, x27, x26 +umulh x30, x29, x28 +umulh x23, x30, xzr +madd w1, w3, w7, w4 +madd wzr, w0, w9, w11 +madd w13, wzr, w4, w4 +madd w19, w30, wzr, w29 +mul w4, w5, w6 +madd x1, x3, x7, x4 +madd xzr, x0, x9, x11 +madd x13, xzr, x4, x4 +madd x19, x30, xzr, x29 +mul x4, x5, x6 +msub w1, w3, w7, w4 +msub wzr, w0, w9, w11 +msub w13, wzr, w4, w4 +msub w19, w30, wzr, w29 +mneg w4, w5, w6 +msub x1, x3, x7, x4 +msub xzr, x0, x9, x11 +msub x13, xzr, x4, x4 +msub x19, x30, xzr, x29 +mneg x4, x5, x6 +smaddl x3, w5, w2, x9 +smaddl xzr, w10, w11, x12 +smaddl x13, wzr, w14, x15 +smaddl x16, w17, wzr, x18 +smull x19, w20, w21 +smsubl x3, w5, w2, x9 +smsubl xzr, w10, w11, x12 +smsubl x13, wzr, w14, x15 +smsubl x16, w17, wzr, x18 +smnegl x19, w20, w21 +umaddl x3, w5, w2, x9 +umaddl xzr, w10, w11, x12 +umaddl x13, wzr, w14, x15 +umaddl x16, w17, wzr, x18 +umull x19, w20, w21 +umsubl x3, w5, w2, x9 +umsubl x16, w17, wzr, x18 +umnegl x19, w20, w21 +smulh x30, x29, x28 +smulh x23, x22, xzr +umulh x23, x22, xzr +mul x19, x20, xzr +mneg w21, w22, w23 +smull x11, w13, w17 +umull x11, w13, w17 +smnegl x11, w13, w17 +umnegl x11, w13, w17 + 
+#------------------------------------------------------------------------------ +# Extract (immediate) +#------------------------------------------------------------------------------ + +extr w3, w5, w7, #0 +extr w11, w13, w17, #31 +extr x3, x5, x7, #15 +extr x11, x13, x17, #63 +ror x19, x23, #24 +ror x29, xzr, #63 +ror w9, w13, #31 + +#------------------------------------------------------------------------------ +# Floating-point compare +#------------------------------------------------------------------------------ + +fcmp s3, s5 +fcmp s31, #0.0 +fcmp s31, #0.0 +fcmpe s29, s30 +fcmpe s15, #0.0 +fcmpe s15, #0.0 +fcmp d4, d12 +fcmp d23, #0.0 +fcmp d23, #0.0 +fcmpe d26, d22 +fcmpe d29, #0.0 +fcmpe d29, #0.0 + +#------------------------------------------------------------------------------ +# Floating-point conditional compare +#------------------------------------------------------------------------------ + +fccmp s1, s31, #0, eq +fccmp s3, s0, #15, hs +fccmp s31, s15, #13, hs +fccmp d9, d31, #0, le +fccmp d3, d0, #15, gt +fccmp d31, d5, #7, ne +fccmpe s1, s31, #0, eq +fccmpe s3, s0, #15, hs +fccmpe s31, s15, #13, hs +fccmpe d9, d31, #0, le +fccmpe d3, d0, #15, gt +fccmpe d31, d5, #7, ne + +#------------------------------------------------------------------------------- +# Floating-point conditional compare +#------------------------------------------------------------------------------- + +fcsel s3, s20, s9, pl +fcsel d9, d10, d11, mi + +#------------------------------------------------------------------------------ +# Floating-point data-processing (1 source) +#------------------------------------------------------------------------------ + +fmov s0, s1 +fabs s2, s3 +fneg s4, s5 +fsqrt s6, s7 +fcvt d8, s9 +fcvt h10, s11 +frintn s12, s13 +frintp s14, s15 +frintm s16, s17 +frintz s18, s19 +frinta s20, s21 +frintx s22, s23 +frinti s24, s25 +fmov d0, d1 +fabs d2, d3 +fneg d4, d5 +fsqrt d6, d7 +fcvt s8, d9 +fcvt h10, d11 +frintn d12, d13 +frintp d14, d15 +frintm 
d16, d17 +frintz d18, d19 +frinta d20, d21 +frintx d22, d23 +frinti d24, d25 +fcvt s26, h27 +fcvt d28, h29 + +#------------------------------------------------------------------------------ +# Floating-point data-processing (2 sources) +#------------------------------------------------------------------------------ + +fmul s20, s19, s17 +fdiv s1, s2, s3 +fadd s4, s5, s6 +fsub s7, s8, s9 +fmax s10, s11, s12 +fmin s13, s14, s15 +fmaxnm s16, s17, s18 +fminnm s19, s20, s21 +fnmul s22, s23, s2 +fmul d20, d19, d17 +fdiv d1, d2, d3 +fadd d4, d5, d6 +fsub d7, d8, d9 +fmax d10, d11, d12 +fmin d13, d14, d15 +fmaxnm d16, d17, d18 +fminnm d19, d20, d21 +fnmul d22, d23, d24 + +#------------------------------------------------------------------------------ +# Floating-point data-processing (1 source) +#------------------------------------------------------------------------------ + +fmadd s3, s5, s6, s31 +fmadd d3, d13, d0, d23 +fmsub s3, s5, s6, s31 +fmsub d3, d13, d0, d23 +fnmadd s3, s5, s6, s31 +fnmadd d3, d13, d0, d23 +fnmsub s3, s5, s6, s31 +fnmsub d3, d13, d0, d23 + +#------------------------------------------------------------------------------ +# Floating-point <-> fixed-point conversion +#------------------------------------------------------------------------------ + +fcvtzs w3, h5, #1 +fcvtzs wzr, h20, #13 +fcvtzs w19, h0, #32 +fcvtzs x3, h5, #1 +fcvtzs x12, h30, #45 +fcvtzs x19, h0, #64 +fcvtzs w3, s5, #1 +fcvtzs wzr, s20, #13 +fcvtzs w19, s0, #32 +fcvtzs x3, s5, #1 +fcvtzs x12, s30, #45 +fcvtzs x19, s0, #64 +fcvtzs w3, d5, #1 +fcvtzs wzr, d20, #13 +fcvtzs w19, d0, #32 +fcvtzs x3, d5, #1 +fcvtzs x12, d30, #45 +fcvtzs x19, d0, #64 +fcvtzu w3, h5, #1 +fcvtzu wzr, h20, #13 +fcvtzu w19, h0, #32 +fcvtzu x3, h5, #1 +fcvtzu x12, h30, #45 +fcvtzu x19, h0, #64 +fcvtzu w3, s5, #1 +fcvtzu wzr, s20, #13 +fcvtzu w19, s0, #32 +fcvtzu x3, s5, #1 +fcvtzu x12, s30, #45 +fcvtzu x19, s0, #64 +fcvtzu w3, d5, #1 +fcvtzu wzr, d20, #13 +fcvtzu w19, d0, #32 +fcvtzu x3, d5, #1 +fcvtzu x12, 
d30, #45 +fcvtzu x19, d0, #64 +scvtf h23, w19, #1 +scvtf h31, wzr, #20 +scvtf h14, w0, #32 +scvtf h23, x19, #1 +scvtf h31, xzr, #20 +scvtf h14, x0, #64 +scvtf s23, w19, #1 +scvtf s31, wzr, #20 +scvtf s14, w0, #32 +scvtf s23, x19, #1 +scvtf s31, xzr, #20 +scvtf s14, x0, #64 +scvtf d23, w19, #1 +scvtf d31, wzr, #20 +scvtf d14, w0, #32 +scvtf d23, x19, #1 +scvtf d31, xzr, #20 +scvtf d14, x0, #64 +ucvtf h23, w19, #1 +ucvtf h31, wzr, #20 +ucvtf h14, w0, #32 +ucvtf h23, x19, #1 +ucvtf h31, xzr, #20 +ucvtf h14, x0, #64 +ucvtf s23, w19, #1 +ucvtf s31, wzr, #20 +ucvtf s14, w0, #32 +ucvtf s23, x19, #1 +ucvtf s31, xzr, #20 +ucvtf s14, x0, #64 +ucvtf d23, w19, #1 +ucvtf d31, wzr, #20 +ucvtf d14, w0, #32 +ucvtf d23, x19, #1 +ucvtf d31, xzr, #20 +ucvtf d14, x0, #64 + +#------------------------------------------------------------------------------ +# Floating-point <-> integer conversion +#------------------------------------------------------------------------------ + +fcvtns w3, h31 +fcvtns xzr, h12 +fcvtnu wzr, h12 +fcvtnu x0, h0 +fcvtps wzr, h9 +fcvtps x12, h20 +fcvtpu w30, h23 +fcvtpu x29, h3 +fcvtms w2, h3 +fcvtms x4, h5 +fcvtmu w6, h7 +fcvtmu x8, h9 +fcvtzs w10, h11 +fcvtzs x12, h13 +fcvtzu w14, h15 +fcvtzu x15, h16 +scvtf h17, w18 +scvtf h19, x20 +ucvtf h21, w22 +scvtf h23, x24 +fcvtas w25, h26 +fcvtas x27, h28 +fcvtau w29, h30 +fcvtau xzr, h0 +fcvtns w3, s31 +fcvtns xzr, s12 +fcvtnu wzr, s12 +fcvtnu x0, s0 +fcvtps wzr, s9 +fcvtps x12, s20 +fcvtpu w30, s23 +fcvtpu x29, s3 +fcvtms w2, s3 +fcvtms x4, s5 +fcvtmu w6, s7 +fcvtmu x8, s9 +fcvtzs w10, s11 +fcvtzs x12, s13 +fcvtzu w14, s15 +fcvtzu x15, s16 +scvtf s17, w18 +scvtf s19, x20 +ucvtf s21, w22 +scvtf s23, x24 +fcvtas w25, s26 +fcvtas x27, s28 +fcvtau w29, s30 +fcvtau xzr, s0 +fcvtns w3, d31 +fcvtns xzr, d12 +fcvtnu wzr, d12 +fcvtnu x0, d0 +fcvtps wzr, d9 +fcvtps x12, d20 +fcvtpu w30, d23 +fcvtpu x29, d3 +fcvtms w2, d3 +fcvtms x4, d5 +fcvtmu w6, d7 +fcvtmu x8, d9 +fcvtzs w10, d11 +fcvtzs x12, d13 +fcvtzu w14, d15 +fcvtzu 
x15, d16 +scvtf d17, w18 +scvtf d19, x20 +ucvtf d21, w22 +ucvtf d23, x24 +fcvtas w25, d26 +fcvtas x27, d28 +fcvtau w29, d30 +fcvtau xzr, d0 +fmov w3, s9 +fmov s9, w3 +fmov x20, d31 +fmov d1, x15 +fmov x3, v12.d[1] +fmov v1.d[1], x19 + +#------------------------------------------------------------------------------ +# Floating-point immediate +#------------------------------------------------------------------------------ + +fmov s2, #0.12500000 +fmov s3, #1.00000000 +fmov d30, #16.00000000 +fmov s4, #1.06250000 +fmov d10, #1.93750000 +fmov s12, #-1.00000000 +fmov d16, #8.50000000 + +#------------------------------------------------------------------------------ +# Load-register (literal) +#------------------------------------------------------------------------------ + +ldr w3, #0 +ldr x29, #4 +ldrsw xzr, #-4 +ldr s0, #8 +ldr d0, #1048572 +ldr q0, #-1048576 +prfm pldl1strm, #0 +prfm #22, #0 + +#------------------------------------------------------------------------------ +# Load/store exclusive +#------------------------------------------------------------------------------ + +stxrb w18, w8, [sp] +stxrh w24, w15, [x16] +stxr w5, w6, [x17] +stxr w1, x10, [x21] +ldxrb w30, [x0] +ldxrh w17, [x4] +ldxr w22, [sp] +ldxr x11, [x29] +ldxr x11, [x29] +ldxr x11, [x29] +stxp w12, w11, w10, [sp] +stxp wzr, x27, x9, [x12] +ldxp w0, wzr, [sp] +ldxp x17, x0, [x18] +ldxp x17, x0, [x18] +stlxrb w12, w22, [x0] +stlxrh w10, w1, [x1] +stlxr w9, w2, [x2] +stlxr w9, x3, [sp] +ldaxrb w8, [x4] +ldaxrh w7, [x5] +ldaxr w6, [sp] +ldaxr x5, [x6] +ldaxr x5, [x6] +ldaxr x5, [x6] +stlxp w4, w5, w6, [sp] +stlxp wzr, x6, x7, [x1] +ldaxp w5, w18, [sp] +ldaxp x6, x19, [x22] +ldaxp x6, x19, [x22] +stlrb w24, [sp] +stlrh w25, [x30] +stlr w26, [x29] +stlr x27, [x28] +stlr x27, [x28] +stlr x27, [x28] +ldarb w23, [sp] +ldarh w22, [x30] +ldar wzr, [x29] +ldar x21, [x28] +ldar x21, [x28] +ldar x21, [x28] + +#------------------------------------------------------------------------------ +# Load/store 
(unscaled immediate) +#------------------------------------------------------------------------------ + +sturb w9, [sp] +sturh wzr, [x12, #255] +stur w16, [x0, #-256] +stur x28, [x14, #1] +ldurb w1, [x20, #255] +ldurh w20, [x1, #255] +ldur w12, [sp, #255] +ldur xzr, [x12, #255] +ldursb x9, [x7, #-256] +ldursh x17, [x19, #-256] +ldursw x20, [x15, #-256] +prfum pldl2keep, [sp, #-256] +ldursb w19, [x1, #-256] +ldursh w15, [x21, #-256] +stur b0, [sp, #1] +stur h12, [x12, #-1] +stur s15, [x0, #255] +stur d31, [x5, #25] +stur q9, [x5] +ldur b3, [sp] +ldur h5, [x4, #-256] +ldur s7, [x12, #-1] +ldur d11, [x19, #4] +ldur q13, [x1, #2] + +#------------------------------------------------------------------------------ +# Load/store (immediate post-indexed) +#------------------------------------------------------------------------------ + +strb w9, [x2], #255 +strb w10, [x3], #1 +strb w10, [x3], #-256 +strh w9, [x2], #255 +strh w9, [x2], #1 +strh w10, [x3], #-256 +str w19, [sp], #255 +str w20, [x30], #1 +str w21, [x12], #-256 +str xzr, [x9], #255 +str x2, [x3], #1 +str x19, [x12], #-256 +ldrb w9, [x2], #255 +ldrb w10, [x3], #1 +ldrb w10, [x3], #-256 +ldrh w9, [x2], #255 +ldrh w9, [x2], #1 +ldrh w10, [x3], #-256 +ldr w19, [sp], #255 +ldr w20, [x30], #1 +ldr w21, [x12], #-256 +ldr xzr, [x9], #255 +ldr x2, [x3], #1 +ldr x19, [x12], #-256 +ldrsb xzr, [x9], #255 +ldrsb x2, [x3], #1 +ldrsb x19, [x12], #-256 +ldrsh xzr, [x9], #255 +ldrsh x2, [x3], #1 +ldrsh x19, [x12], #-256 +ldrsw xzr, [x9], #255 +ldrsw x2, [x3], #1 +ldrsw x19, [x12], #-256 +ldrsb wzr, [x9], #255 +ldrsb w2, [x3], #1 +ldrsb w19, [x12], #-256 +ldrsh wzr, [x9], #255 +ldrsh w2, [x3], #1 +ldrsh w19, [x12], #-256 +str b0, [x0], #255 +str b3, [x3], #1 +str b5, [sp], #-256 +str h10, [x10], #255 +str h13, [x23], #1 +str h15, [sp], #-256 +str s20, [x20], #255 +str s23, [x23], #1 +str s25, [x0], #-256 +str d20, [x20], #255 +str d23, [x23], #1 +str d25, [x0], #-256 +ldr b0, [x0], #255 +ldr b3, [x3], #1 +ldr b5, [sp], #-256 +ldr 
h10, [x10], #255 +ldr h13, [x23], #1 +ldr h15, [sp], #-256 +ldr s20, [x20], #255 +ldr s23, [x23], #1 +ldr s25, [x0], #-256 +ldr d20, [x20], #255 +ldr d23, [x23], #1 +ldr d25, [x0], #-256 +ldr q20, [x1], #255 +ldr q23, [x9], #1 +ldr q25, [x20], #-256 +str q10, [x1], #255 +str q22, [sp], #1 +str q21, [x20], #-256 + +#------------------------------------------------------------------------------- +# Load-store register (immediate pre-indexed) +#------------------------------------------------------------------------------- + +ldr x3, [x4, #0]! +strb w9, [x2, #255]! +strb w10, [x3, #1]! +strb w10, [x3, #-256]! +strh w9, [x2, #255]! +strh w9, [x2, #1]! +strh w10, [x3, #-256]! +str w19, [sp, #255]! +str w20, [x30, #1]! +str w21, [x12, #-256]! +str xzr, [x9, #255]! +str x2, [x3, #1]! +str x19, [x12, #-256]! +ldrb w9, [x2, #255]! +ldrb w10, [x3, #1]! +ldrb w10, [x3, #-256]! +ldrh w9, [x2, #255]! +ldrh w9, [x2, #1]! +ldrh w10, [x3, #-256]! +ldr w19, [sp, #255]! +ldr w20, [x30, #1]! +ldr w21, [x12, #-256]! +ldr xzr, [x9, #255]! +ldr x2, [x3, #1]! +ldr x19, [x12, #-256]! +ldrsb xzr, [x9, #255]! +ldrsb x2, [x3, #1]! +ldrsb x19, [x12, #-256]! +ldrsh xzr, [x9, #255]! +ldrsh x2, [x3, #1]! +ldrsh x19, [x12, #-256]! +ldrsw xzr, [x9, #255]! +ldrsw x2, [x3, #1]! +ldrsw x19, [x12, #-256]! +ldrsb wzr, [x9, #255]! +ldrsb w2, [x3, #1]! +ldrsb w19, [x12, #-256]! +ldrsh wzr, [x9, #255]! +ldrsh w2, [x3, #1]! +ldrsh w19, [x12, #-256]! +str b0, [x0, #255]! +str b3, [x3, #1]! +str b5, [sp, #-256]! +str h10, [x10, #255]! +str h13, [x23, #1]! +str h15, [sp, #-256]! +str s20, [x20, #255]! +str s23, [x23, #1]! +str s25, [x0, #-256]! +str d20, [x20, #255]! +str d23, [x23, #1]! +str d25, [x0, #-256]! +ldr b0, [x0, #255]! +ldr b3, [x3, #1]! +ldr b5, [sp, #-256]! +ldr h10, [x10, #255]! +ldr h13, [x23, #1]! +ldr h15, [sp, #-256]! +ldr s20, [x20, #255]! +ldr s23, [x23, #1]! +ldr s25, [x0, #-256]! +ldr d20, [x20, #255]! +ldr d23, [x23, #1]! +ldr d25, [x0, #-256]! +ldr q20, [x1, #255]! +ldr q23, [x9, #1]! 
+ldr q25, [x20, #-256]! +str q10, [x1, #255]! +str q22, [sp, #1]! +str q21, [x20, #-256]! + +#------------------------------------------------------------------------------ +# Load/store (unprivileged) +#------------------------------------------------------------------------------ + +sttrb w9, [sp] +sttrh wzr, [x12, #255] +sttr w16, [x0, #-256] +sttr x28, [x14, #1] +ldtrb w1, [x20, #255] +ldtrh w20, [x1, #255] +ldtr w12, [sp, #255] +ldtr xzr, [x12, #255] +ldtrsb x9, [x7, #-256] +ldtrsh x17, [x19, #-256] +ldtrsw x20, [x15, #-256] +ldtrsb w19, [x1, #-256] +ldtrsh w15, [x21, #-256] + +#------------------------------------------------------------------------------ +# Load/store (unsigned immediate) +#------------------------------------------------------------------------------ + +ldr x4, [x29] +ldr x30, [x12, #32760] +ldr x20, [sp, #8] +ldr xzr, [sp] +ldr w2, [sp] +ldr w17, [sp, #16380] +ldr w13, [x2, #4] +ldrsw x2, [x5, #4] +ldrsw x23, [sp, #16380] +ldrh w2, [x4] +ldrsh w23, [x6, #8190] +ldrsh wzr, [sp, #2] +ldrsh x29, [x2, #2] +ldrb w26, [x3, #121] +ldrb w12, [x2] +ldrsb w27, [sp, #4095] +ldrsb xzr, [x15] +str x30, [sp] +str w20, [x4, #16380] +strh w17, [sp, #8190] +strb w23, [x3, #4095] +strb wzr, [x2] +ldr b31, [sp, #4095] +ldr h20, [x2, #8190] +ldr s10, [x19, #16380] +ldr d3, [x10, #32760] +str q12, [sp, #65520] + +#------------------------------------------------------------------------------ +# Load/store (register offset) +#------------------------------------------------------------------------------ + +ldrb w3, [sp, x5] +ldrb w9, [x27, x6] +ldrsb w10, [x30, x7] +ldrb w11, [x29, x3, sxtx] +strb w12, [x28, xzr, sxtx] +ldrb w14, [x26, w6, uxtw] +ldrsb w15, [x25, w7, uxtw] +ldrb w17, [x23, w9, sxtw] +ldrsb x18, [x22, w10, sxtw] +ldrsh w3, [sp, x5] +ldrsh w9, [x27, x6] +ldrh w10, [x30, x7, lsl #1] +strh w11, [x29, x3, sxtx] +ldrh w12, [x28, xzr, sxtx] +ldrsh x13, [x27, x5, sxtx #1] +ldrh w14, [x26, w6, uxtw] +ldrh w15, [x25, w7, uxtw] +ldrsh w16, [x24, w8, uxtw 
#1] +ldrh w17, [x23, w9, sxtw] +ldrh w18, [x22, w10, sxtw] +strh w19, [x21, wzr, sxtw #1] +ldr w3, [sp, x5] +ldr s9, [x27, x6] +ldr w10, [x30, x7, lsl #2] +ldr w11, [x29, x3, sxtx] +str s12, [x28, xzr, sxtx] +str w13, [x27, x5, sxtx #2] +str w14, [x26, w6, uxtw] +ldr w15, [x25, w7, uxtw] +ldr w16, [x24, w8, uxtw #2] +ldrsw x17, [x23, w9, sxtw] +ldr w18, [x22, w10, sxtw] +ldrsw x19, [x21, wzr, sxtw #2] +ldr x3, [sp, x5] +str x9, [x27, x6] +ldr d10, [x30, x7, lsl #3] +str x11, [x29, x3, sxtx] +ldr x12, [x28, xzr, sxtx] +ldr x13, [x27, x5, sxtx #3] +prfm pldl1keep, [x26, w6, uxtw] +ldr x15, [x25, w7, uxtw] +ldr x16, [x24, w8, uxtw #3] +ldr x17, [x23, w9, sxtw] +ldr x18, [x22, w10, sxtw] +str d19, [x21, wzr, sxtw #3] +ldr q3, [sp, x5] +ldr q9, [x27, x6] +ldr q10, [x30, x7, lsl #4] +str q11, [x29, x3, sxtx] +str q12, [x28, xzr, sxtx] +str q13, [x27, x5, sxtx #4] +ldr q14, [x26, w6, uxtw] +ldr q15, [x25, w7, uxtw] +ldr q16, [x24, w8, uxtw #4] +ldr q17, [x23, w9, sxtw] +str q18, [x22, w10, sxtw] +ldr q19, [x21, wzr, sxtw #4] + +#------------------------------------------------------------------------------ +# Load/store register pair (offset) +#------------------------------------------------------------------------------ + +ldp w3, w5, [sp] +stp wzr, w9, [sp, #252] +ldp w2, wzr, [sp, #-256] +ldp w9, w10, [sp, #4] +ldpsw x9, x10, [sp, #4] +ldpsw x9, x10, [x2, #-256] +ldpsw x20, x30, [sp, #252] +ldp x21, x29, [x2, #504] +ldp x22, x23, [x3, #-512] +ldp x24, x25, [x4, #8] +ldp s29, s28, [sp, #252] +stp s27, s26, [sp, #-256] +ldp s1, s2, [x3, #44] +stp d3, d5, [x9, #504] +stp d7, d11, [x10, #-512] +ldp d2, d3, [x30, #-8] +stp q3, q5, [sp] +stp q17, q19, [sp, #1008] +ldp q23, q29, [x1, #-1024] + +#------------------------------------------------------------------------------ +# Load/store register pair (post-indexed) +#------------------------------------------------------------------------------ + +ldp w3, w5, [sp], #0 +stp wzr, w9, [sp], #252 +ldp w2, wzr, [sp], #-256 +ldp 
w9, w10, [sp], #4 +ldpsw x9, x10, [sp], #4 +ldpsw x9, x10, [x2], #-256 +ldpsw x20, x30, [sp], #252 +ldp x21, x29, [x2], #504 +ldp x22, x23, [x3], #-512 +ldp x24, x25, [x4], #8 +ldp s29, s28, [sp], #252 +stp s27, s26, [sp], #-256 +ldp s1, s2, [x3], #44 +stp d3, d5, [x9], #504 +stp d7, d11, [x10], #-512 +ldp d2, d3, [x30], #-8 +stp q3, q5, [sp], #0 +stp q17, q19, [sp], #1008 +ldp q23, q29, [x1], #-1024 + +#------------------------------------------------------------------------------ +# Load/store register pair (pre-indexed) +#------------------------------------------------------------------------------ + +ldp w3, w5, [sp, #0]! +stp wzr, w9, [sp, #252]! +ldp w2, wzr, [sp, #-256]! +ldp w9, w10, [sp, #4]! +ldpsw x9, x10, [sp, #4]! +ldpsw x9, x10, [x2, #-256]! +ldpsw x20, x30, [sp, #252]! +ldp x21, x29, [x2, #504]! +ldp x22, x23, [x3, #-512]! +ldp x24, x25, [x4, #8]! +ldp s29, s28, [sp, #252]! +stp s27, s26, [sp, #-256]! +ldp s1, s2, [x3, #44]! +stp d3, d5, [x9, #504]! +stp d7, d11, [x10, #-512]! +ldp d2, d3, [x30, #-8]! +stp q3, q5, [sp, #0]! +stp q17, q19, [sp, #1008]! +ldp q23, q29, [x1, #-1024]! 
+ +#------------------------------------------------------------------------------ +# Load/store register pair (offset) +#------------------------------------------------------------------------------ + +ldnp w3, w5, [sp] +stnp wzr, w9, [sp, #252] +ldnp w2, wzr, [sp, #-256] +ldnp w9, w10, [sp, #4] +ldnp x21, x29, [x2, #504] +ldnp x22, x23, [x3, #-512] +ldnp x24, x25, [x4, #8] +ldnp s29, s28, [sp, #252] +stnp s27, s26, [sp, #-256] +ldnp s1, s2, [x3, #44] +stnp d3, d5, [x9, #504] +stnp d7, d11, [x10, #-512] +ldnp d2, d3, [x30, #-8] +stnp q3, q5, [sp] +stnp q17, q19, [sp, #1008] +ldnp q23, q29, [x1, #-1024] + +#------------------------------------------------------------------------------ +# Logical (immediate) +#------------------------------------------------------------------------------ + +mov w3, #983055 +mov x10, #-6148914691236517206 + +#------------------------------------------------------------------------------ +# Logical (shifted register) +#------------------------------------------------------------------------------ + +and w12, w23, w21 +and w16, w15, w1, lsl #1 +and w9, w4, w10, lsl #31 +and w3, w30, w11 +and x3, x5, x7, lsl #63 +and x5, x14, x19, asr #4 +and w3, w17, w19, ror #31 +and w0, w2, wzr, lsr #17 +and w3, w30, w11, asr #2 +and xzr, x4, x26 +and w3, wzr, w20, ror #2 +and x7, x20, xzr, asr #63 +bic x13, x20, x14, lsl #47 +bic w2, w7, w9 +orr w2, w7, w0, asr #31 +orr x8, x9, x10, lsl #12 +orn x3, x5, x7, asr #2 +orn w2, w5, w29 +ands w7, wzr, w9, lsl #1 +ands x3, x5, x20, ror #63 +bics w3, w5, w7 +bics x3, xzr, x3, lsl #1 +tst w3, w7, lsl #31 +tst x2, x20, asr #2 +mov x3, x6 +mov x3, xzr +mov wzr, w2 +mov w3, w5 + +#------------------------------------------------------------------------------ +# Move wide (immediate) +#------------------------------------------------------------------------------ + +movz w2, #0, lsl #16 +mov w2, #-1235 +mov x2, #5299989643264 +mov x2, #0 +movk w3, #0 +movz x4, #0, lsl #16 +movk w5, #0, lsl #16 +movz x6, #0, lsl 
#32 +movk x7, #0, lsl #32 +movz x8, #0, lsl #48 +movk x9, #0, lsl #48 + +#------------------------------------------------------------------------------ +# PC-relative addressing +#------------------------------------------------------------------------------ + +adr x2, #1600 +adrp x21, #6553600 +adr x0, #262144 + +#------------------------------------------------------------------------------ +# Test and branch (immediate) +#------------------------------------------------------------------------------ + +tbz x12, #62, #0 +tbz x12, #62, #4 +tbz x12, #62, #-32768 +tbnz x12, #60, #32764 + +#------------------------------------------------------------------------------ +# Unconditional branch (immediate) +#------------------------------------------------------------------------------ + +b #4 +b #-4 +b #134217724 + +#------------------------------------------------------------------------------ +# Unconditional branch (register) +#------------------------------------------------------------------------------ + +br x20 +blr xzr +ret x10 +ret +eret +drps + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.25 add w2, w3, #4095 +# CHECK-NEXT: 1 1 0.25 add w30, w29, #1, lsl #12 +# CHECK-NEXT: 1 1 0.25 add w13, w5, #4095, lsl #12 +# CHECK-NEXT: 1 1 0.25 add x5, x7, #1638 +# CHECK-NEXT: 1 1 0.25 add w20, wsp, #801 +# CHECK-NEXT: 1 1 0.25 add wsp, wsp, #1104 +# CHECK-NEXT: 1 1 0.25 add wsp, w30, #4084 +# CHECK-NEXT: 1 1 0.25 add x0, x24, #291 +# CHECK-NEXT: 1 1 0.25 add x3, x24, #4095, lsl #12 +# CHECK-NEXT: 1 1 0.25 add x8, sp, #1074 +# CHECK-NEXT: 1 1 0.25 add sp, x29, #3816 +# CHECK-NEXT: 1 1 0.25 sub w0, wsp, #4077 +# CHECK-NEXT: 1 1 0.25 sub w4, w20, #546, lsl #12 +# CHECK-NEXT: 1 1 0.25 sub sp, sp, #288 +# CHECK-NEXT: 1 1 0.25 sub wsp, w19, 
#16 +# CHECK-NEXT: 1 1 0.50 adds w13, w23, #291, lsl #12 +# CHECK-NEXT: 1 1 0.50 cmn w2, #4095 +# CHECK-NEXT: 1 1 0.50 adds w20, wsp, #0 +# CHECK-NEXT: 1 1 0.50 cmn x3, #1, lsl #12 +# CHECK-NEXT: 1 1 0.50 cmp sp, #20, lsl #12 +# CHECK-NEXT: 1 1 0.50 cmp x30, #4095 +# CHECK-NEXT: 1 1 0.50 subs x4, sp, #3822 +# CHECK-NEXT: 1 1 0.50 cmn w3, #291, lsl #12 +# CHECK-NEXT: 1 1 0.50 cmn wsp, #1365 +# CHECK-NEXT: 1 1 0.50 cmn sp, #1092, lsl #12 +# CHECK-NEXT: 1 1 0.25 mov sp, x30 +# CHECK-NEXT: 1 1 0.25 mov wsp, w20 +# CHECK-NEXT: 1 1 0.25 mov x11, sp +# CHECK-NEXT: 1 1 0.25 mov w24, wsp +# CHECK-NEXT: 1 1 0.25 add w3, w5, w7 +# CHECK-NEXT: 1 1 0.25 add wzr, w3, w5 +# CHECK-NEXT: 1 1 0.25 add w20, wzr, w4 +# CHECK-NEXT: 1 1 0.25 add w4, w6, wzr +# CHECK-NEXT: 1 1 0.25 add w11, w13, w15 +# CHECK-NEXT: 2 2 0.50 add w9, w3, wzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 add w17, w29, w20, lsl #31 +# CHECK-NEXT: 2 2 0.50 add w21, w22, w23, lsr #0 +# CHECK-NEXT: 2 2 0.50 add w24, w25, w26, lsr #18 +# CHECK-NEXT: 2 2 0.50 add w27, w28, w29, lsr #31 +# CHECK-NEXT: 2 2 0.50 add w2, w3, w4, asr #0 +# CHECK-NEXT: 2 2 0.50 add w5, w6, w7, asr #21 +# CHECK-NEXT: 2 2 0.50 add w8, w9, w10, asr #31 +# CHECK-NEXT: 1 1 0.25 add x3, x5, x7 +# CHECK-NEXT: 1 1 0.25 add xzr, x3, x5 +# CHECK-NEXT: 1 1 0.25 add x20, xzr, x4 +# CHECK-NEXT: 1 1 0.25 add x4, x6, xzr +# CHECK-NEXT: 1 1 0.25 add x11, x13, x15 +# CHECK-NEXT: 2 2 0.50 add x9, x3, xzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 add x17, x29, x20, lsl #63 +# CHECK-NEXT: 2 2 0.50 add x21, x22, x23, lsr #0 +# CHECK-NEXT: 2 2 0.50 add x24, x25, x26, lsr #18 +# CHECK-NEXT: 2 2 0.50 add x27, x28, x29, lsr #63 +# CHECK-NEXT: 2 2 0.50 add x2, x3, x4, asr #0 +# CHECK-NEXT: 2 2 0.50 add x5, x6, x7, asr #21 +# CHECK-NEXT: 2 2 0.50 add x8, x9, x10, asr #63 +# CHECK-NEXT: 1 1 0.25 adds w3, w5, w7 +# CHECK-NEXT: 1 1 0.25 cmn w3, w5 +# CHECK-NEXT: 1 1 0.25 adds w20, wzr, w4 +# CHECK-NEXT: 1 1 0.25 adds w4, w6, wzr +# CHECK-NEXT: 1 1 0.25 adds w11, w13, w15 +# CHECK-NEXT: 2 
2 0.50 adds w9, w3, wzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 adds w17, w29, w20, lsl #31 +# CHECK-NEXT: 2 2 0.50 adds w21, w22, w23, lsr #0 +# CHECK-NEXT: 2 2 0.50 adds w24, w25, w26, lsr #18 +# CHECK-NEXT: 2 2 0.50 adds w27, w28, w29, lsr #31 +# CHECK-NEXT: 2 2 0.50 adds w2, w3, w4, asr #0 +# CHECK-NEXT: 2 2 0.50 adds w5, w6, w7, asr #21 +# CHECK-NEXT: 2 2 0.50 adds w8, w9, w10, asr #31 +# CHECK-NEXT: 1 1 0.25 adds x3, x5, x7 +# CHECK-NEXT: 1 1 0.25 cmn x3, x5 +# CHECK-NEXT: 1 1 0.25 adds x20, xzr, x4 +# CHECK-NEXT: 1 1 0.25 adds x4, x6, xzr +# CHECK-NEXT: 1 1 0.25 adds x11, x13, x15 +# CHECK-NEXT: 2 2 0.50 adds x9, x3, xzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 adds x17, x29, x20, lsl #63 +# CHECK-NEXT: 2 2 0.50 adds x21, x22, x23, lsr #0 +# CHECK-NEXT: 2 2 0.50 adds x24, x25, x26, lsr #18 +# CHECK-NEXT: 2 2 0.50 adds x27, x28, x29, lsr #63 +# CHECK-NEXT: 2 2 0.50 adds x2, x3, x4, asr #0 +# CHECK-NEXT: 2 2 0.50 adds x5, x6, x7, asr #21 +# CHECK-NEXT: 2 2 0.50 adds x8, x9, x10, asr #63 +# CHECK-NEXT: 1 1 0.25 sub w3, w5, w7 +# CHECK-NEXT: 1 1 0.25 sub wzr, w3, w5 +# CHECK-NEXT: 1 1 0.25 sub w4, w6, wzr +# CHECK-NEXT: 1 1 0.25 sub w11, w13, w15 +# CHECK-NEXT: 2 2 0.50 sub w9, w3, wzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 sub w17, w29, w20, lsl #31 +# CHECK-NEXT: 2 2 0.50 sub w21, w22, w23, lsr #0 +# CHECK-NEXT: 2 2 0.50 sub w24, w25, w26, lsr #18 +# CHECK-NEXT: 2 2 0.50 sub w27, w28, w29, lsr #31 +# CHECK-NEXT: 2 2 0.50 sub w2, w3, w4, asr #0 +# CHECK-NEXT: 2 2 0.50 sub w5, w6, w7, asr #21 +# CHECK-NEXT: 2 2 0.50 sub w8, w9, w10, asr #31 +# CHECK-NEXT: 1 1 0.25 sub x3, x5, x7 +# CHECK-NEXT: 1 1 0.25 sub xzr, x3, x5 +# CHECK-NEXT: 1 1 0.25 sub x4, x6, xzr +# CHECK-NEXT: 1 1 0.25 sub x11, x13, x15 +# CHECK-NEXT: 2 2 0.50 sub x9, x3, xzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 sub x17, x29, x20, lsl #63 +# CHECK-NEXT: 2 2 0.50 sub x21, x22, x23, lsr #0 +# CHECK-NEXT: 2 2 0.50 sub x24, x25, x26, lsr #18 +# CHECK-NEXT: 2 2 0.50 sub x27, x28, x29, lsr #63 +# CHECK-NEXT: 2 2 0.50 sub x2, x3, 
x4, asr #0 +# CHECK-NEXT: 2 2 0.50 sub x5, x6, x7, asr #21 +# CHECK-NEXT: 2 2 0.50 sub x8, x9, x10, asr #63 +# CHECK-NEXT: 1 1 0.25 subs w3, w5, w7 +# CHECK-NEXT: 1 1 0.25 cmp w3, w5 +# CHECK-NEXT: 1 1 0.25 subs w4, w6, wzr +# CHECK-NEXT: 1 1 0.25 subs w11, w13, w15 +# CHECK-NEXT: 2 2 0.50 subs w9, w3, wzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 subs w17, w29, w20, lsl #31 +# CHECK-NEXT: 2 2 0.50 subs w21, w22, w23, lsr #0 +# CHECK-NEXT: 2 2 0.50 subs w24, w25, w26, lsr #18 +# CHECK-NEXT: 2 2 0.50 subs w27, w28, w29, lsr #31 +# CHECK-NEXT: 2 2 0.50 subs w2, w3, w4, asr #0 +# CHECK-NEXT: 2 2 0.50 subs w5, w6, w7, asr #21 +# CHECK-NEXT: 2 2 0.50 subs w8, w9, w10, asr #31 +# CHECK-NEXT: 1 1 0.25 subs x3, x5, x7 +# CHECK-NEXT: 1 1 0.25 cmp x3, x5 +# CHECK-NEXT: 1 1 0.25 subs x4, x6, xzr +# CHECK-NEXT: 1 1 0.25 subs x11, x13, x15 +# CHECK-NEXT: 2 2 0.50 subs x9, x3, xzr, lsl #10 +# CHECK-NEXT: 2 2 0.50 subs x17, x29, x20, lsl #63 +# CHECK-NEXT: 2 2 0.50 subs x21, x22, x23, lsr #0 +# CHECK-NEXT: 2 2 0.50 subs x24, x25, x26, lsr #18 +# CHECK-NEXT: 2 2 0.50 subs x27, x28, x29, lsr #63 +# CHECK-NEXT: 2 2 0.50 subs x2, x3, x4, asr #0 +# CHECK-NEXT: 2 2 0.50 subs x5, x6, x7, asr #21 +# CHECK-NEXT: 2 2 0.50 subs x8, x9, x10, asr #63 +# CHECK-NEXT: 1 1 0.25 cmn wzr, w4 +# CHECK-NEXT: 1 1 0.25 cmn w5, wzr +# CHECK-NEXT: 1 1 0.25 cmn w6, w7 +# CHECK-NEXT: 2 2 0.50 cmn w8, w9, lsl #15 +# CHECK-NEXT: 2 2 0.50 cmn w10, w11, lsl #31 +# CHECK-NEXT: 2 2 0.50 cmn w12, w13, lsr #0 +# CHECK-NEXT: 2 2 0.50 cmn w14, w15, lsr #21 +# CHECK-NEXT: 2 2 0.50 cmn w16, w17, lsr #31 +# CHECK-NEXT: 2 2 0.50 cmn w18, w19, asr #0 +# CHECK-NEXT: 2 2 0.50 cmn w20, w21, asr #22 +# CHECK-NEXT: 2 2 0.50 cmn w22, w23, asr #31 +# CHECK-NEXT: 1 1 0.25 cmn x0, x3 +# CHECK-NEXT: 1 1 0.25 cmn xzr, x4 +# CHECK-NEXT: 1 1 0.25 cmn x5, xzr +# CHECK-NEXT: 1 1 0.25 cmn x6, x7 +# CHECK-NEXT: 2 2 0.50 cmn x8, x9, lsl #15 +# CHECK-NEXT: 2 2 0.50 cmn x10, x11, lsl #63 +# CHECK-NEXT: 2 2 0.50 cmn x12, x13, lsr #0 +# CHECK-NEXT: 2 
2 0.50 cmn x14, x15, lsr #41 +# CHECK-NEXT: 2 2 0.50 cmn x16, x17, lsr #63 +# CHECK-NEXT: 2 2 0.50 cmn x18, x19, asr #0 +# CHECK-NEXT: 2 2 0.50 cmn x20, x21, asr #55 +# CHECK-NEXT: 2 2 0.50 cmn x22, x23, asr #63 +# CHECK-NEXT: 1 1 0.25 cmp w0, w3 +# CHECK-NEXT: 1 1 0.25 cmp wzr, w4 +# CHECK-NEXT: 1 1 0.25 cmp w5, wzr +# CHECK-NEXT: 1 1 0.25 cmp w6, w7 +# CHECK-NEXT: 2 2 0.50 cmp w8, w9, lsl #15 +# CHECK-NEXT: 2 2 0.50 cmp w10, w11, lsl #31 +# CHECK-NEXT: 2 2 0.50 cmp w12, w13, lsr #0 +# CHECK-NEXT: 2 2 0.50 cmp w14, w15, lsr #21 +# CHECK-NEXT: 2 2 0.50 cmp w18, w19, asr #0 +# CHECK-NEXT: 2 2 0.50 cmp w20, w21, asr #22 +# CHECK-NEXT: 2 2 0.50 cmp w22, w23, asr #31 +# CHECK-NEXT: 1 1 0.25 cmp x0, x3 +# CHECK-NEXT: 1 1 0.25 cmp xzr, x4 +# CHECK-NEXT: 1 1 0.25 cmp x5, xzr +# CHECK-NEXT: 1 1 0.25 cmp x6, x7 +# CHECK-NEXT: 2 2 0.50 cmp x8, x9, lsl #15 +# CHECK-NEXT: 2 2 0.50 cmp x10, x11, lsl #63 +# CHECK-NEXT: 2 2 0.50 cmp x12, x13, lsr #0 +# CHECK-NEXT: 2 2 0.50 cmp x14, x15, lsr #41 +# CHECK-NEXT: 2 2 0.50 cmp x16, x17, lsr #63 +# CHECK-NEXT: 2 2 0.50 cmp x18, x19, asr #0 +# CHECK-NEXT: 2 2 0.50 cmp x20, x21, asr #55 +# CHECK-NEXT: 2 2 0.50 cmp x22, x23, asr #63 +# CHECK-NEXT: 1 1 0.25 cmp wzr, w0 +# CHECK-NEXT: 1 1 0.25 cmp xzr, x0 +# CHECK-NEXT: 1 1 0.50 adc w29, w27, w25 +# CHECK-NEXT: 1 1 0.50 adc wzr, w3, w4 +# CHECK-NEXT: 1 1 0.50 adc w9, wzr, w10 +# CHECK-NEXT: 1 1 0.50 adc w20, w0, wzr +# CHECK-NEXT: 1 1 0.50 adc x29, x27, x25 +# CHECK-NEXT: 1 1 0.50 adc xzr, x3, x4 +# CHECK-NEXT: 1 1 0.50 adc x9, xzr, x10 +# CHECK-NEXT: 1 1 0.50 adc x20, x0, xzr +# CHECK-NEXT: 1 1 0.50 adcs w29, w27, w25 +# CHECK-NEXT: 1 1 0.50 adcs wzr, w3, w4 +# CHECK-NEXT: 1 1 0.50 adcs w9, wzr, w10 +# CHECK-NEXT: 1 1 0.50 adcs w20, w0, wzr +# CHECK-NEXT: 1 1 0.50 adcs x29, x27, x25 +# CHECK-NEXT: 1 1 0.50 adcs xzr, x3, x4 +# CHECK-NEXT: 1 1 0.50 adcs x9, xzr, x10 +# CHECK-NEXT: 1 1 0.50 adcs x20, x0, xzr +# CHECK-NEXT: 1 1 0.50 sbc w29, w27, w25 +# CHECK-NEXT: 1 1 0.50 sbc wzr, w3, w4 +# 
CHECK-NEXT: 1 1 0.50 ngc w9, w10 +# CHECK-NEXT: 1 1 0.50 sbc w20, w0, wzr +# CHECK-NEXT: 1 1 0.50 sbc x29, x27, x25 +# CHECK-NEXT: 1 1 0.50 sbc xzr, x3, x4 +# CHECK-NEXT: 1 1 0.50 ngc x9, x10 +# CHECK-NEXT: 1 1 0.50 sbc x20, x0, xzr +# CHECK-NEXT: 1 1 0.50 sbcs w29, w27, w25 +# CHECK-NEXT: 1 1 0.50 sbcs wzr, w3, w4 +# CHECK-NEXT: 1 1 0.50 ngcs w9, w10 +# CHECK-NEXT: 1 1 0.50 sbcs w20, w0, wzr +# CHECK-NEXT: 1 1 0.50 sbcs x29, x27, x25 +# CHECK-NEXT: 1 1 0.50 sbcs xzr, x3, x4 +# CHECK-NEXT: 1 1 0.50 ngcs x9, x10 +# CHECK-NEXT: 1 1 0.50 sbcs x20, x0, xzr +# CHECK-NEXT: 1 1 0.50 ngc w3, w12 +# CHECK-NEXT: 1 1 0.50 ngc wzr, w9 +# CHECK-NEXT: 1 1 0.50 ngc w23, wzr +# CHECK-NEXT: 1 1 0.50 ngc x29, x30 +# CHECK-NEXT: 1 1 0.50 ngc xzr, x0 +# CHECK-NEXT: 1 1 0.50 ngc x0, xzr +# CHECK-NEXT: 1 1 0.50 ngcs w3, w12 +# CHECK-NEXT: 1 1 0.50 ngcs wzr, w9 +# CHECK-NEXT: 1 1 0.50 ngcs w23, wzr +# CHECK-NEXT: 1 1 0.50 ngcs x29, x30 +# CHECK-NEXT: 1 1 0.50 ngcs xzr, x0 +# CHECK-NEXT: 1 1 0.50 ngcs x0, xzr +# CHECK-NEXT: 1 1 0.50 sbfx x1, x2, #3, #2 +# CHECK-NEXT: 1 1 0.50 asr x3, x4, #63 +# CHECK-NEXT: 1 1 0.50 asr wzr, wzr, #31 +# CHECK-NEXT: 1 1 0.50 sbfx w12, w9, #0, #1 +# CHECK-NEXT: 1 1 0.50 ubfiz x4, x5, #52, #11 +# CHECK-NEXT: 1 1 0.50 ubfx xzr, x4, #0, #1 +# CHECK-NEXT: 1 1 0.50 ubfiz x4, xzr, #1, #6 +# CHECK-NEXT: 1 1 0.50 lsr x5, x6, #12 +# CHECK-NEXT: 1 1 0.50 bfi x4, x5, #52, #11 +# CHECK-NEXT: 1 1 0.50 bfxil xzr, x4, #0, #1 +# CHECK-NEXT: 1 1 0.50 bfc x4, #1, #6 +# CHECK-NEXT: 1 1 0.50 bfxil x5, x6, #12, #52 +# CHECK-NEXT: 1 1 0.50 sxtb w1, w2 +# CHECK-NEXT: 1 1 0.50 sxtb xzr, w3 +# CHECK-NEXT: 1 1 0.50 sxth w9, w10 +# CHECK-NEXT: 1 1 0.50 sxth x0, w1 +# CHECK-NEXT: 1 1 0.50 sxtw x3, w30 +# CHECK-NEXT: 1 1 0.50 uxtb w1, w2 +# CHECK-NEXT: 1 1 0.50 uxth w9, w10 +# CHECK-NEXT: 1 1 0.50 ubfx x3, x30, #0, #32 +# CHECK-NEXT: 1 1 0.50 asr w3, w2, #0 +# CHECK-NEXT: 1 1 0.50 asr w9, w10, #31 +# CHECK-NEXT: 1 1 0.50 asr x20, x21, #63 +# CHECK-NEXT: 1 1 0.50 asr w1, wzr, #3 +# 
CHECK-NEXT: 1 1 0.50 lsr w3, w2, #0 +# CHECK-NEXT: 1 1 0.50 lsr w9, w10, #31 +# CHECK-NEXT: 1 1 0.50 lsr x20, x21, #63 +# CHECK-NEXT: 1 1 0.50 lsr wzr, wzr, #3 +# CHECK-NEXT: 1 1 0.50 lsr w3, w2, #0 +# CHECK-NEXT: 1 1 0.50 lsl w9, w10, #31 +# CHECK-NEXT: 1 1 0.50 lsl x20, x21, #63 +# CHECK-NEXT: 1 1 0.50 lsl w1, wzr, #3 +# CHECK-NEXT: 1 1 0.50 sbfx w9, w10, #0, #1 +# CHECK-NEXT: 1 1 0.50 sbfiz x2, x3, #63, #1 +# CHECK-NEXT: 1 1 0.50 asr x19, x20, #0 +# CHECK-NEXT: 1 1 0.50 sbfiz x9, x10, #5, #59 +# CHECK-NEXT: 1 1 0.50 asr w9, w10, #0 +# CHECK-NEXT: 1 1 0.50 sbfiz w11, w12, #31, #1 +# CHECK-NEXT: 1 1 0.50 sbfiz w13, w14, #29, #3 +# CHECK-NEXT: 1 1 0.50 sbfiz xzr, xzr, #10, #11 +# CHECK-NEXT: 1 1 0.50 sbfx w9, w10, #0, #1 +# CHECK-NEXT: 1 1 0.50 asr x2, x3, #63 +# CHECK-NEXT: 1 1 0.50 asr x19, x20, #0 +# CHECK-NEXT: 1 1 0.50 asr x9, x10, #5 +# CHECK-NEXT: 1 1 0.50 asr w9, w10, #0 +# CHECK-NEXT: 1 1 0.50 asr w11, w12, #31 +# CHECK-NEXT: 1 1 0.50 asr w13, w14, #29 +# CHECK-NEXT: 1 1 0.50 sbfx xzr, xzr, #10, #11 +# CHECK-NEXT: 1 1 0.50 bfxil w9, w10, #0, #1 +# CHECK-NEXT: 1 1 0.50 bfi x2, x3, #63, #1 +# CHECK-NEXT: 1 1 0.50 bfxil x19, x20, #0, #64 +# CHECK-NEXT: 1 1 0.50 bfi x9, x10, #5, #59 +# CHECK-NEXT: 1 1 0.50 bfxil w9, w10, #0, #32 +# CHECK-NEXT: 1 1 0.50 bfi w11, w12, #31, #1 +# CHECK-NEXT: 1 1 0.50 bfi w13, w14, #29, #3 +# CHECK-NEXT: 1 1 0.50 bfc xzr, #10, #11 +# CHECK-NEXT: 1 1 0.50 bfxil w9, w10, #0, #1 +# CHECK-NEXT: 1 1 0.50 bfxil x2, x3, #63, #1 +# CHECK-NEXT: 1 1 0.50 bfxil x19, x20, #0, #64 +# CHECK-NEXT: 1 1 0.50 bfxil x9, x10, #5, #59 +# CHECK-NEXT: 1 1 0.50 bfxil w9, w10, #0, #32 +# CHECK-NEXT: 1 1 0.50 bfxil w11, w12, #31, #1 +# CHECK-NEXT: 1 1 0.50 bfxil w13, w14, #29, #3 +# CHECK-NEXT: 1 1 0.50 bfxil xzr, xzr, #10, #11 +# CHECK-NEXT: 1 1 0.50 ubfx w9, w10, #0, #1 +# CHECK-NEXT: 1 1 0.50 lsl x2, x3, #63 +# CHECK-NEXT: 1 1 0.50 lsr x19, x20, #0 +# CHECK-NEXT: 1 1 0.50 lsl x9, x10, #5 +# CHECK-NEXT: 1 1 0.50 lsr w9, w10, #0 +# CHECK-NEXT: 1 1 0.50 
lsl w11, w12, #31 +# CHECK-NEXT: 1 1 0.50 lsl w13, w14, #29 +# CHECK-NEXT: 1 1 0.50 ubfiz xzr, xzr, #10, #11 +# CHECK-NEXT: 1 1 0.50 ubfx w9, w10, #0, #1 +# CHECK-NEXT: 1 1 0.50 lsr x2, x3, #63 +# CHECK-NEXT: 1 1 0.50 lsr x19, x20, #0 +# CHECK-NEXT: 1 1 0.50 lsr x9, x10, #5 +# CHECK-NEXT: 1 1 0.50 lsr w9, w10, #0 +# CHECK-NEXT: 1 1 0.50 lsr w11, w12, #31 +# CHECK-NEXT: 1 1 0.50 lsr w13, w14, #29 +# CHECK-NEXT: 1 1 0.50 ubfx xzr, xzr, #10, #11 +# CHECK-NEXT: 1 1 0.50 cbz w5, #4 +# CHECK-NEXT: 1 1 0.50 cbz x5, #0 +# CHECK-NEXT: 1 1 0.50 cbnz x2, #-4 +# CHECK-NEXT: 1 1 0.50 cbnz x26, #1048572 +# CHECK-NEXT: 1 1 0.50 cbz wzr, #0 +# CHECK-NEXT: 1 1 0.50 cbnz xzr, #0 +# CHECK-NEXT: 1 1 0.50 b.ne #4 +# CHECK-NEXT: 1 1 0.50 b.ge #1048572 +# CHECK-NEXT: 1 1 0.50 b.ge #-4 +# CHECK-NEXT: 1 1 0.50 ccmp w1, #31, #0, eq +# CHECK-NEXT: 1 1 0.50 ccmp w3, #0, #15, hs +# CHECK-NEXT: 1 1 0.50 ccmp wzr, #15, #13, hs +# CHECK-NEXT: 1 1 0.50 ccmp x9, #31, #0, le +# CHECK-NEXT: 1 1 0.50 ccmp x3, #0, #15, gt +# CHECK-NEXT: 1 1 0.50 ccmp xzr, #5, #7, ne +# CHECK-NEXT: 1 1 0.50 ccmn w1, #31, #0, eq +# CHECK-NEXT: 1 1 0.50 ccmn w3, #0, #15, hs +# CHECK-NEXT: 1 1 0.50 ccmn wzr, #15, #13, hs +# CHECK-NEXT: 1 1 0.50 ccmn x9, #31, #0, le +# CHECK-NEXT: 1 1 0.50 ccmn x3, #0, #15, gt +# CHECK-NEXT: 1 1 0.50 ccmn xzr, #5, #7, ne +# CHECK-NEXT: 1 1 0.50 ccmp w1, wzr, #0, eq +# CHECK-NEXT: 1 1 0.50 ccmp w3, w0, #15, hs +# CHECK-NEXT: 1 1 0.50 ccmp wzr, w15, #13, hs +# CHECK-NEXT: 1 1 0.50 ccmp x9, xzr, #0, le +# CHECK-NEXT: 1 1 0.50 ccmp x3, x0, #15, gt +# CHECK-NEXT: 1 1 0.50 ccmp xzr, x5, #7, ne +# CHECK-NEXT: 1 1 0.50 ccmn w1, wzr, #0, eq +# CHECK-NEXT: 1 1 0.50 ccmn w3, w0, #15, hs +# CHECK-NEXT: 1 1 0.50 ccmn wzr, w15, #13, hs +# CHECK-NEXT: 1 1 0.50 ccmn x9, xzr, #0, le +# CHECK-NEXT: 1 1 0.50 ccmn x3, x0, #15, gt +# CHECK-NEXT: 1 1 0.50 ccmn xzr, x5, #7, ne +# CHECK-NEXT: 1 1 0.50 csel w1, w0, w19, ne +# CHECK-NEXT: 1 1 0.50 csel wzr, w5, w9, eq +# CHECK-NEXT: 1 1 0.50 csel w9, wzr, w30, gt +# 
CHECK-NEXT: 1 1 0.50 csel w1, w28, wzr, mi +# CHECK-NEXT: 1 1 0.50 csel x19, x23, x29, lt +# CHECK-NEXT: 1 1 0.50 csel xzr, x3, x4, ge +# CHECK-NEXT: 1 1 0.50 csel x5, xzr, x6, hs +# CHECK-NEXT: 1 1 0.50 csel x7, x8, xzr, lo +# CHECK-NEXT: 1 1 0.50 csinc w1, w0, w19, ne +# CHECK-NEXT: 1 1 0.50 csinc wzr, w5, w9, eq +# CHECK-NEXT: 1 1 0.50 csinc w9, wzr, w30, gt +# CHECK-NEXT: 1 1 0.50 csinc w1, w28, wzr, mi +# CHECK-NEXT: 1 1 0.50 csinc x19, x23, x29, lt +# CHECK-NEXT: 1 1 0.50 csinc xzr, x3, x4, ge +# CHECK-NEXT: 1 1 0.50 csinc x5, xzr, x6, hs +# CHECK-NEXT: 1 1 0.50 csinc x7, x8, xzr, lo +# CHECK-NEXT: 1 1 0.50 csinv w1, w0, w19, ne +# CHECK-NEXT: 1 1 0.50 csinv wzr, w5, w9, eq +# CHECK-NEXT: 1 1 0.50 csinv w9, wzr, w30, gt +# CHECK-NEXT: 1 1 0.50 csinv w1, w28, wzr, mi +# CHECK-NEXT: 1 1 0.50 csinv x19, x23, x29, lt +# CHECK-NEXT: 1 1 0.50 csinv xzr, x3, x4, ge +# CHECK-NEXT: 1 1 0.50 csinv x5, xzr, x6, hs +# CHECK-NEXT: 1 1 0.50 csinv x7, x8, xzr, lo +# CHECK-NEXT: 1 1 0.50 csneg w1, w0, w19, ne +# CHECK-NEXT: 1 1 0.50 csneg wzr, w5, w9, eq +# CHECK-NEXT: 1 1 0.50 csneg w9, wzr, w30, gt +# CHECK-NEXT: 1 1 0.50 csneg w1, w28, wzr, mi +# CHECK-NEXT: 1 1 0.50 csneg x19, x23, x29, lt +# CHECK-NEXT: 1 1 0.50 csneg xzr, x3, x4, ge +# CHECK-NEXT: 1 1 0.50 csneg x5, xzr, x6, hs +# CHECK-NEXT: 1 1 0.50 csneg x7, x8, xzr, lo +# CHECK-NEXT: 1 1 0.50 cset w3, eq +# CHECK-NEXT: 1 1 0.50 cset x9, pl +# CHECK-NEXT: 1 1 0.50 csetm w20, ne +# CHECK-NEXT: 1 1 0.50 csetm x30, ge +# CHECK-NEXT: 1 1 0.50 csinc w2, wzr, wzr, al +# CHECK-NEXT: 1 1 0.50 csinv x3, xzr, xzr, nv +# CHECK-NEXT: 1 1 0.50 cinc w3, w5, gt +# CHECK-NEXT: 1 1 0.50 cinc wzr, w4, le +# CHECK-NEXT: 1 1 0.50 cset w9, lt +# CHECK-NEXT: 1 1 0.50 cinc x3, x5, gt +# CHECK-NEXT: 1 1 0.50 cinc xzr, x4, le +# CHECK-NEXT: 1 1 0.50 cset x9, lt +# CHECK-NEXT: 1 1 0.50 csinc w5, w6, w6, nv +# CHECK-NEXT: 1 1 0.50 csinc x1, x2, x2, al +# CHECK-NEXT: 1 1 0.50 cinv w3, w5, gt +# CHECK-NEXT: 1 1 0.50 cinv wzr, w4, le +# 
CHECK-NEXT: 1 1 0.50 csetm w9, lt +# CHECK-NEXT: 1 1 0.50 cinv x3, x5, gt +# CHECK-NEXT: 1 1 0.50 cinv xzr, x4, le +# CHECK-NEXT: 1 1 0.50 csetm x9, lt +# CHECK-NEXT: 1 1 0.50 csinv x1, x0, x0, al +# CHECK-NEXT: 1 1 0.50 csinv w9, w8, w8, nv +# CHECK-NEXT: 1 1 0.50 cneg w3, w5, gt +# CHECK-NEXT: 1 1 0.50 cneg wzr, w4, le +# CHECK-NEXT: 1 1 0.50 cneg w9, wzr, lt +# CHECK-NEXT: 1 1 0.50 cneg x3, x5, gt +# CHECK-NEXT: 1 1 0.50 cneg xzr, x4, le +# CHECK-NEXT: 1 1 0.50 cneg x9, xzr, lt +# CHECK-NEXT: 1 1 0.50 csneg x4, x8, x8, al +# CHECK-NEXT: 1 1 0.50 csinv w9, w8, w8, nv +# CHECK-NEXT: 1 1 0.50 rbit w0, w7 +# CHECK-NEXT: 1 1 0.50 rbit x18, x3 +# CHECK-NEXT: 1 1 0.50 rev16 w17, w1 +# CHECK-NEXT: 1 1 0.50 rev16 x5, x2 +# CHECK-NEXT: 1 1 0.50 rev w18, w0 +# CHECK-NEXT: 1 1 0.50 rev32 x20, x1 +# CHECK-NEXT: 1 1 0.50 rev x22, x2 +# CHECK-NEXT: 1 1 0.25 clz w24, w3 +# CHECK-NEXT: 1 1 0.25 clz x26, x4 +# CHECK-NEXT: 1 1 0.50 cls w3, w5 +# CHECK-NEXT: 1 1 0.50 cls x20, x5 +# CHECK-NEXT: 2 13 1.00 udiv w0, w7, w10 +# CHECK-NEXT: 3 13 2.00 udiv x9, x22, x4 +# CHECK-NEXT: 2 13 1.00 sdiv w12, w21, w0 +# CHECK-NEXT: 3 13 2.00 sdiv x13, x2, x1 +# CHECK-NEXT: 1 1 0.50 lsl w11, w12, w13 +# CHECK-NEXT: 1 1 0.50 lsl x14, x15, x16 +# CHECK-NEXT: 1 1 0.50 lsr w17, w18, w19 +# CHECK-NEXT: 1 1 0.50 lsr x20, x21, x22 +# CHECK-NEXT: 1 1 0.50 asr w23, w24, w25 +# CHECK-NEXT: 1 1 0.50 asr x26, x27, x28 +# CHECK-NEXT: 1 1 0.50 ror w0, w1, w2 +# CHECK-NEXT: 1 1 0.50 ror x3, x4, x5 +# CHECK-NEXT: 1 1 0.50 lsl w6, w7, w8 +# CHECK-NEXT: 1 1 0.50 lsl x9, x10, x11 +# CHECK-NEXT: 1 1 0.50 lsr w12, w13, w14 +# CHECK-NEXT: 1 1 0.50 lsr x15, x16, x17 +# CHECK-NEXT: 1 1 0.50 asr w18, w19, w20 +# CHECK-NEXT: 1 1 0.50 asr x21, x22, x23 +# CHECK-NEXT: 1 1 0.50 ror w24, w25, w26 +# CHECK-NEXT: 1 1 0.50 ror x27, x28, x29 +# CHECK-NEXT: 1 3 1.00 smulh x30, x29, x28 +# CHECK-NEXT: 1 3 1.00 smulh xzr, x27, x26 +# CHECK-NEXT: 1 3 1.00 umulh x30, x29, x28 +# CHECK-NEXT: 1 3 1.00 umulh x23, x30, xzr +# CHECK-NEXT: 
1 3 1.00 madd w1, w3, w7, w4 +# CHECK-NEXT: 1 3 1.00 madd wzr, w0, w9, w11 +# CHECK-NEXT: 1 3 1.00 madd w13, wzr, w4, w4 +# CHECK-NEXT: 1 3 1.00 madd w19, w30, wzr, w29 +# CHECK-NEXT: 1 3 1.00 mul w4, w5, w6 +# CHECK-NEXT: 1 3 1.00 madd x1, x3, x7, x4 +# CHECK-NEXT: 1 3 1.00 madd xzr, x0, x9, x11 +# CHECK-NEXT: 1 3 1.00 madd x13, xzr, x4, x4 +# CHECK-NEXT: 1 3 1.00 madd x19, x30, xzr, x29 +# CHECK-NEXT: 1 3 1.00 mul x4, x5, x6 +# CHECK-NEXT: 1 3 1.00 msub w1, w3, w7, w4 +# CHECK-NEXT: 1 3 1.00 msub wzr, w0, w9, w11 +# CHECK-NEXT: 1 3 1.00 msub w13, wzr, w4, w4 +# CHECK-NEXT: 1 3 1.00 msub w19, w30, wzr, w29 +# CHECK-NEXT: 1 3 1.00 mneg w4, w5, w6 +# CHECK-NEXT: 1 3 1.00 msub x1, x3, x7, x4 +# CHECK-NEXT: 1 3 1.00 msub xzr, x0, x9, x11 +# CHECK-NEXT: 1 3 1.00 msub x13, xzr, x4, x4 +# CHECK-NEXT: 1 3 1.00 msub x19, x30, xzr, x29 +# CHECK-NEXT: 1 3 1.00 mneg x4, x5, x6 +# CHECK-NEXT: 2 4 1.00 smaddl x3, w5, w2, x9 +# CHECK-NEXT: 2 4 1.00 smaddl xzr, w10, w11, x12 +# CHECK-NEXT: 2 4 1.00 smaddl x13, wzr, w14, x15 +# CHECK-NEXT: 2 4 1.00 smaddl x16, w17, wzr, x18 +# CHECK-NEXT: 2 4 1.00 smull x19, w20, w21 +# CHECK-NEXT: 2 4 1.00 smsubl x3, w5, w2, x9 +# CHECK-NEXT: 2 4 1.00 smsubl xzr, w10, w11, x12 +# CHECK-NEXT: 2 4 1.00 smsubl x13, wzr, w14, x15 +# CHECK-NEXT: 2 4 1.00 smsubl x16, w17, wzr, x18 +# CHECK-NEXT: 2 4 1.00 smnegl x19, w20, w21 +# CHECK-NEXT: 2 4 1.00 umaddl x3, w5, w2, x9 +# CHECK-NEXT: 2 4 1.00 umaddl xzr, w10, w11, x12 +# CHECK-NEXT: 2 4 1.00 umaddl x13, wzr, w14, x15 +# CHECK-NEXT: 2 4 1.00 umaddl x16, w17, wzr, x18 +# CHECK-NEXT: 2 4 1.00 umull x19, w20, w21 +# CHECK-NEXT: 2 4 1.00 umsubl x3, w5, w2, x9 +# CHECK-NEXT: 2 4 1.00 umsubl x16, w17, wzr, x18 +# CHECK-NEXT: 2 4 1.00 umnegl x19, w20, w21 +# CHECK-NEXT: 1 3 1.00 smulh x30, x29, x28 +# CHECK-NEXT: 1 3 1.00 smulh x23, x22, xzr +# CHECK-NEXT: 1 3 1.00 umulh x23, x22, xzr +# CHECK-NEXT: 1 3 1.00 mul x19, x20, xzr +# CHECK-NEXT: 1 3 1.00 mneg w21, w22, w23 +# CHECK-NEXT: 2 4 1.00 smull x11, w13, 
w17 +# CHECK-NEXT: 2 4 1.00 umull x11, w13, w17 +# CHECK-NEXT: 2 4 1.00 smnegl x11, w13, w17 +# CHECK-NEXT: 2 4 1.00 umnegl x11, w13, w17 +# CHECK-NEXT: 1 1 0.50 extr w3, w5, w7, #0 +# CHECK-NEXT: 1 1 0.50 extr w11, w13, w17, #31 +# CHECK-NEXT: 1 1 0.50 extr x3, x5, x7, #15 +# CHECK-NEXT: 1 1 0.50 extr x11, x13, x17, #63 +# CHECK-NEXT: 1 1 0.50 ror x19, x23, #24 +# CHECK-NEXT: 1 1 0.50 ror x29, xzr, #63 +# CHECK-NEXT: 1 1 0.50 ror w9, w13, #31 +# CHECK-NEXT: 1 3 1.00 fcmp s3, s5 +# CHECK-NEXT: 1 3 1.00 fcmp s31, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmp s31, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmpe s29, s30 +# CHECK-NEXT: 1 3 1.00 fcmpe s15, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmpe s15, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmp d4, d12 +# CHECK-NEXT: 1 3 1.00 fcmp d23, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmp d23, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmpe d26, d22 +# CHECK-NEXT: 1 3 1.00 fcmpe d29, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmpe d29, #0.0 +# CHECK-NEXT: 3 9 1.00 fccmp s1, s31, #0, eq +# CHECK-NEXT: 3 9 1.00 fccmp s3, s0, #15, hs +# CHECK-NEXT: 3 9 1.00 fccmp s31, s15, #13, hs +# CHECK-NEXT: 3 9 1.00 fccmp d9, d31, #0, le +# CHECK-NEXT: 3 9 1.00 fccmp d3, d0, #15, gt +# CHECK-NEXT: 3 9 1.00 fccmp d31, d5, #7, ne +# CHECK-NEXT: 3 9 1.00 fccmpe s1, s31, #0, eq +# CHECK-NEXT: 3 9 1.00 fccmpe s3, s0, #15, hs +# CHECK-NEXT: 3 9 1.00 fccmpe s31, s15, #13, hs +# CHECK-NEXT: 3 9 1.00 fccmpe d9, d31, #0, le +# CHECK-NEXT: 3 9 1.00 fccmpe d3, d0, #15, gt +# CHECK-NEXT: 3 9 1.00 fccmpe d31, d5, #7, ne +# CHECK-NEXT: 3 9 1.00 fcsel s3, s20, s9, pl +# CHECK-NEXT: 3 9 1.00 fcsel d9, d10, d11, mi +# CHECK-NEXT: 1 2 0.50 fmov s0, s1 +# CHECK-NEXT: 1 2 0.50 fabs s2, s3 +# CHECK-NEXT: 1 2 0.50 fneg s4, s5 +# CHECK-NEXT: 1 33 1.00 fsqrt s6, s7 +# CHECK-NEXT: 1 3 0.50 fcvt d8, s9 +# CHECK-NEXT: 1 3 0.50 fcvt h10, s11 +# CHECK-NEXT: 1 2 0.50 frintn s12, s13 +# CHECK-NEXT: 1 2 0.50 frintp s14, s15 +# CHECK-NEXT: 1 2 0.50 frintm s16, s17 +# CHECK-NEXT: 1 2 0.50 frintz s18, s19 +# CHECK-NEXT: 1 2 0.50 frinta s20, s21 +# 
CHECK-NEXT: 1 2 0.50 frintx s22, s23 +# CHECK-NEXT: 1 2 0.50 frinti s24, s25 +# CHECK-NEXT: 1 2 0.50 fmov d0, d1 +# CHECK-NEXT: 1 2 0.50 fabs d2, d3 +# CHECK-NEXT: 1 2 0.50 fneg d4, d5 +# CHECK-NEXT: 1 63 1.00 fsqrt d6, d7 +# CHECK-NEXT: 1 3 0.50 fcvt s8, d9 +# CHECK-NEXT: 1 3 0.50 fcvt h10, d11 +# CHECK-NEXT: 1 2 0.50 frintn d12, d13 +# CHECK-NEXT: 1 2 0.50 frintp d14, d15 +# CHECK-NEXT: 1 2 0.50 frintm d16, d17 +# CHECK-NEXT: 1 2 0.50 frintz d18, d19 +# CHECK-NEXT: 1 2 0.50 frinta d20, d21 +# CHECK-NEXT: 1 2 0.50 frintx d22, d23 +# CHECK-NEXT: 1 2 0.50 frinti d24, d25 +# CHECK-NEXT: 1 3 0.50 fcvt s26, h27 +# CHECK-NEXT: 1 3 0.50 fcvt d28, h29 +# CHECK-NEXT: 1 4 0.50 fmul s20, s19, s17 +# CHECK-NEXT: 1 12 1.00 fdiv s1, s2, s3 +# CHECK-NEXT: 1 2 0.50 fadd s4, s5, s6 +# CHECK-NEXT: 1 2 0.50 fsub s7, s8, s9 +# CHECK-NEXT: 1 2 0.50 fmax s10, s11, s12 +# CHECK-NEXT: 1 2 0.50 fmin s13, s14, s15 +# CHECK-NEXT: 1 2 0.50 fmaxnm s16, s17, s18 +# CHECK-NEXT: 1 2 0.50 fminnm s19, s20, s21 +# CHECK-NEXT: 1 4 0.50 fnmul s22, s23, s2 +# CHECK-NEXT: 1 4 0.50 fmul d20, d19, d17 +# CHECK-NEXT: 1 19 1.00 fdiv d1, d2, d3 +# CHECK-NEXT: 1 2 0.50 fadd d4, d5, d6 +# CHECK-NEXT: 1 2 0.50 fsub d7, d8, d9 +# CHECK-NEXT: 1 2 0.50 fmax d10, d11, d12 +# CHECK-NEXT: 1 2 0.50 fmin d13, d14, d15 +# CHECK-NEXT: 1 2 0.50 fmaxnm d16, d17, d18 +# CHECK-NEXT: 1 2 0.50 fminnm d19, d20, d21 +# CHECK-NEXT: 1 4 0.50 fnmul d22, d23, d24 +# CHECK-NEXT: 1 4 0.50 fmadd s3, s5, s6, s31 +# CHECK-NEXT: 1 4 0.50 fmadd d3, d13, d0, d23 +# CHECK-NEXT: 1 4 0.50 fmsub s3, s5, s6, s31 +# CHECK-NEXT: 1 4 0.50 fmsub d3, d13, d0, d23 +# CHECK-NEXT: 1 4 0.50 fnmadd s3, s5, s6, s31 +# CHECK-NEXT: 1 4 0.50 fnmadd d3, d13, d0, d23 +# CHECK-NEXT: 1 4 0.50 fnmsub s3, s5, s6, s31 +# CHECK-NEXT: 1 4 0.50 fnmsub d3, d13, d0, d23 +# CHECK-NEXT: 2 7 1.00 fcvtzs w3, h5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzs wzr, h20, #13 +# CHECK-NEXT: 2 7 1.00 fcvtzs w19, h0, #32 +# CHECK-NEXT: 2 7 1.00 fcvtzs x3, h5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzs 
x12, h30, #45 +# CHECK-NEXT: 2 7 1.00 fcvtzs x19, h0, #64 +# CHECK-NEXT: 2 7 1.00 fcvtzs w3, s5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzs wzr, s20, #13 +# CHECK-NEXT: 2 7 1.00 fcvtzs w19, s0, #32 +# CHECK-NEXT: 2 7 1.00 fcvtzs x3, s5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzs x12, s30, #45 +# CHECK-NEXT: 2 7 1.00 fcvtzs x19, s0, #64 +# CHECK-NEXT: 2 7 1.00 fcvtzs w3, d5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzs wzr, d20, #13 +# CHECK-NEXT: 2 7 1.00 fcvtzs w19, d0, #32 +# CHECK-NEXT: 2 7 1.00 fcvtzs x3, d5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzs x12, d30, #45 +# CHECK-NEXT: 2 7 1.00 fcvtzs x19, d0, #64 +# CHECK-NEXT: 2 7 1.00 fcvtzu w3, h5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzu wzr, h20, #13 +# CHECK-NEXT: 2 7 1.00 fcvtzu w19, h0, #32 +# CHECK-NEXT: 2 7 1.00 fcvtzu x3, h5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzu x12, h30, #45 +# CHECK-NEXT: 2 7 1.00 fcvtzu x19, h0, #64 +# CHECK-NEXT: 2 7 1.00 fcvtzu w3, s5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzu wzr, s20, #13 +# CHECK-NEXT: 2 7 1.00 fcvtzu w19, s0, #32 +# CHECK-NEXT: 2 7 1.00 fcvtzu x3, s5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzu x12, s30, #45 +# CHECK-NEXT: 2 7 1.00 fcvtzu x19, s0, #64 +# CHECK-NEXT: 2 7 1.00 fcvtzu w3, d5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzu wzr, d20, #13 +# CHECK-NEXT: 2 7 1.00 fcvtzu w19, d0, #32 +# CHECK-NEXT: 2 7 1.00 fcvtzu x3, d5, #1 +# CHECK-NEXT: 2 7 1.00 fcvtzu x12, d30, #45 +# CHECK-NEXT: 2 7 1.00 fcvtzu x19, d0, #64 +# CHECK-NEXT: 3 11 1.00 scvtf h23, w19, #1 +# CHECK-NEXT: 3 11 1.00 scvtf h31, wzr, #20 +# CHECK-NEXT: 3 11 1.00 scvtf h14, w0, #32 +# CHECK-NEXT: 3 11 1.00 scvtf h23, x19, #1 +# CHECK-NEXT: 3 11 1.00 scvtf h31, xzr, #20 +# CHECK-NEXT: 3 11 1.00 scvtf h14, x0, #64 +# CHECK-NEXT: 3 11 1.00 scvtf s23, w19, #1 +# CHECK-NEXT: 3 11 1.00 scvtf s31, wzr, #20 +# CHECK-NEXT: 3 11 1.00 scvtf s14, w0, #32 +# CHECK-NEXT: 3 11 1.00 scvtf s23, x19, #1 +# CHECK-NEXT: 3 11 1.00 scvtf s31, xzr, #20 +# CHECK-NEXT: 3 11 1.00 scvtf s14, x0, #64 +# CHECK-NEXT: 3 11 1.00 scvtf d23, w19, #1 +# CHECK-NEXT: 3 11 1.00 scvtf d31, wzr, #20 +# 
CHECK-NEXT: 3 11 1.00 scvtf d14, w0, #32 +# CHECK-NEXT: 3 11 1.00 scvtf d23, x19, #1 +# CHECK-NEXT: 3 11 1.00 scvtf d31, xzr, #20 +# CHECK-NEXT: 3 11 1.00 scvtf d14, x0, #64 +# CHECK-NEXT: 3 11 1.00 ucvtf h23, w19, #1 +# CHECK-NEXT: 3 11 1.00 ucvtf h31, wzr, #20 +# CHECK-NEXT: 3 11 1.00 ucvtf h14, w0, #32 +# CHECK-NEXT: 3 11 1.00 ucvtf h23, x19, #1 +# CHECK-NEXT: 3 11 1.00 ucvtf h31, xzr, #20 +# CHECK-NEXT: 3 11 1.00 ucvtf h14, x0, #64 +# CHECK-NEXT: 3 11 1.00 ucvtf s23, w19, #1 +# CHECK-NEXT: 3 11 1.00 ucvtf s31, wzr, #20 +# CHECK-NEXT: 3 11 1.00 ucvtf s14, w0, #32 +# CHECK-NEXT: 3 11 1.00 ucvtf s23, x19, #1 +# CHECK-NEXT: 3 11 1.00 ucvtf s31, xzr, #20 +# CHECK-NEXT: 3 11 1.00 ucvtf s14, x0, #64 +# CHECK-NEXT: 3 11 1.00 ucvtf d23, w19, #1 +# CHECK-NEXT: 3 11 1.00 ucvtf d31, wzr, #20 +# CHECK-NEXT: 3 11 1.00 ucvtf d14, w0, #32 +# CHECK-NEXT: 3 11 1.00 ucvtf d23, x19, #1 +# CHECK-NEXT: 3 11 1.00 ucvtf d31, xzr, #20 +# CHECK-NEXT: 3 11 1.00 ucvtf d14, x0, #64 +# CHECK-NEXT: 2 7 1.00 fcvtns w3, h31 +# CHECK-NEXT: 2 7 1.00 fcvtns xzr, h12 +# CHECK-NEXT: 2 7 1.00 fcvtnu wzr, h12 +# CHECK-NEXT: 2 7 1.00 fcvtnu x0, h0 +# CHECK-NEXT: 2 7 1.00 fcvtps wzr, h9 +# CHECK-NEXT: 2 7 1.00 fcvtps x12, h20 +# CHECK-NEXT: 2 7 1.00 fcvtpu w30, h23 +# CHECK-NEXT: 2 7 1.00 fcvtpu x29, h3 +# CHECK-NEXT: 2 7 1.00 fcvtms w2, h3 +# CHECK-NEXT: 2 7 1.00 fcvtms x4, h5 +# CHECK-NEXT: 2 7 1.00 fcvtmu w6, h7 +# CHECK-NEXT: 2 7 1.00 fcvtmu x8, h9 +# CHECK-NEXT: 2 7 1.00 fcvtzs w10, h11 +# CHECK-NEXT: 2 7 1.00 fcvtzs x12, h13 +# CHECK-NEXT: 2 7 1.00 fcvtzu w14, h15 +# CHECK-NEXT: 2 7 1.00 fcvtzu x15, h16 +# CHECK-NEXT: 3 11 1.00 scvtf h17, w18 +# CHECK-NEXT: 3 11 1.00 scvtf h19, x20 +# CHECK-NEXT: 3 11 1.00 ucvtf h21, w22 +# CHECK-NEXT: 3 11 1.00 scvtf h23, x24 +# CHECK-NEXT: 2 7 1.00 fcvtas w25, h26 +# CHECK-NEXT: 2 7 1.00 fcvtas x27, h28 +# CHECK-NEXT: 2 7 1.00 fcvtau w29, h30 +# CHECK-NEXT: 2 7 1.00 fcvtau xzr, h0 +# CHECK-NEXT: 2 7 1.00 fcvtns w3, s31 +# CHECK-NEXT: 2 7 1.00 fcvtns xzr, s12 +# 
CHECK-NEXT: 2 7 1.00 fcvtnu wzr, s12 +# CHECK-NEXT: 2 7 1.00 fcvtnu x0, s0 +# CHECK-NEXT: 2 7 1.00 fcvtps wzr, s9 +# CHECK-NEXT: 2 7 1.00 fcvtps x12, s20 +# CHECK-NEXT: 2 7 1.00 fcvtpu w30, s23 +# CHECK-NEXT: 2 7 1.00 fcvtpu x29, s3 +# CHECK-NEXT: 2 7 1.00 fcvtms w2, s3 +# CHECK-NEXT: 2 7 1.00 fcvtms x4, s5 +# CHECK-NEXT: 2 7 1.00 fcvtmu w6, s7 +# CHECK-NEXT: 2 7 1.00 fcvtmu x8, s9 +# CHECK-NEXT: 2 7 1.00 fcvtzs w10, s11 +# CHECK-NEXT: 2 7 1.00 fcvtzs x12, s13 +# CHECK-NEXT: 2 7 1.00 fcvtzu w14, s15 +# CHECK-NEXT: 2 7 1.00 fcvtzu x15, s16 +# CHECK-NEXT: 3 11 1.00 scvtf s17, w18 +# CHECK-NEXT: 3 11 1.00 scvtf s19, x20 +# CHECK-NEXT: 3 11 1.00 ucvtf s21, w22 +# CHECK-NEXT: 3 11 1.00 scvtf s23, x24 +# CHECK-NEXT: 2 7 1.00 fcvtas w25, s26 +# CHECK-NEXT: 2 7 1.00 fcvtas x27, s28 +# CHECK-NEXT: 2 7 1.00 fcvtau w29, s30 +# CHECK-NEXT: 2 7 1.00 fcvtau xzr, s0 +# CHECK-NEXT: 2 7 1.00 fcvtns w3, d31 +# CHECK-NEXT: 2 7 1.00 fcvtns xzr, d12 +# CHECK-NEXT: 2 7 1.00 fcvtnu wzr, d12 +# CHECK-NEXT: 2 7 1.00 fcvtnu x0, d0 +# CHECK-NEXT: 2 7 1.00 fcvtps wzr, d9 +# CHECK-NEXT: 2 7 1.00 fcvtps x12, d20 +# CHECK-NEXT: 2 7 1.00 fcvtpu w30, d23 +# CHECK-NEXT: 2 7 1.00 fcvtpu x29, d3 +# CHECK-NEXT: 2 7 1.00 fcvtms w2, d3 +# CHECK-NEXT: 2 7 1.00 fcvtms x4, d5 +# CHECK-NEXT: 2 7 1.00 fcvtmu w6, d7 +# CHECK-NEXT: 2 7 1.00 fcvtmu x8, d9 +# CHECK-NEXT: 2 7 1.00 fcvtzs w10, d11 +# CHECK-NEXT: 2 7 1.00 fcvtzs x12, d13 +# CHECK-NEXT: 2 7 1.00 fcvtzu w14, d15 +# CHECK-NEXT: 2 7 1.00 fcvtzu x15, d16 +# CHECK-NEXT: 3 11 1.00 scvtf d17, w18 +# CHECK-NEXT: 3 11 1.00 scvtf d19, x20 +# CHECK-NEXT: 3 11 1.00 ucvtf d21, w22 +# CHECK-NEXT: 3 11 1.00 ucvtf d23, x24 +# CHECK-NEXT: 2 7 1.00 fcvtas w25, d26 +# CHECK-NEXT: 2 7 1.00 fcvtas x27, d28 +# CHECK-NEXT: 2 7 1.00 fcvtau w29, d30 +# CHECK-NEXT: 2 7 1.00 fcvtau xzr, d0 +# CHECK-NEXT: 1 5 1.00 fmov w3, s9 +# CHECK-NEXT: 1 3 1.00 fmov s9, w3 +# CHECK-NEXT: 1 5 1.00 fmov x20, d31 +# CHECK-NEXT: 1 3 1.00 fmov d1, x15 +# CHECK-NEXT: 2 7 1.00 fmov x3, v12.d[1] 
+# CHECK-NEXT: 1 5 1.00 fmov v1.d[1], x19 +# CHECK-NEXT: 1 2 0.50 fmov s2, #0.12500000 +# CHECK-NEXT: 1 2 0.50 fmov s3, #1.00000000 +# CHECK-NEXT: 1 2 0.50 fmov d30, #16.00000000 +# CHECK-NEXT: 1 2 0.50 fmov s4, #1.06250000 +# CHECK-NEXT: 1 2 0.50 fmov d10, #1.93750000 +# CHECK-NEXT: 1 2 0.50 fmov s12, #-1.00000000 +# CHECK-NEXT: 1 2 0.50 fmov d16, #8.50000000 +# CHECK-NEXT: 1 3 0.50 * ldr w3, #0 +# CHECK-NEXT: 1 3 0.50 * ldr x29, #4 +# CHECK-NEXT: 1 3 0.50 * ldrsw xzr, #-4 +# CHECK-NEXT: 1 3 0.50 * ldr s0, #8 +# CHECK-NEXT: 1 3 0.50 * ldr d0, #1048572 +# CHECK-NEXT: 1 3 0.50 * ldr q0, #-1048576 +# CHECK-NEXT: 1 1 0.50 U prfm pldl1strm, #0 +# CHECK-NEXT: 1 1 0.50 U prfm #22, #0 +# CHECK-NEXT: 2 4 0.50 * * U stxrb w18, w8, [sp] +# CHECK-NEXT: 2 4 0.50 * * U stxrh w24, w15, [x16] +# CHECK-NEXT: 2 4 0.50 * * U stxr w5, w6, [x17] +# CHECK-NEXT: 2 4 0.50 * * U stxr w1, x10, [x21] +# CHECK-NEXT: 1 3 0.50 * * U ldxrb w30, [x0] +# CHECK-NEXT: 1 3 0.50 * * U ldxrh w17, [x4] +# CHECK-NEXT: 1 3 0.50 * * U ldxr w22, [sp] +# CHECK-NEXT: 1 3 0.50 * * U ldxr x11, [x29] +# CHECK-NEXT: 1 3 0.50 * * U ldxr x11, [x29] +# CHECK-NEXT: 1 3 0.50 * * U ldxr x11, [x29] +# CHECK-NEXT: 2 4 0.50 * * U stxp w12, w11, w10, [sp] +# CHECK-NEXT: 2 4 0.50 * * U stxp wzr, x27, x9, [x12] +# CHECK-NEXT: 2 3 0.50 * * U ldxp w0, wzr, [sp] +# CHECK-NEXT: 2 3 0.50 * * U ldxp x17, x0, [x18] +# CHECK-NEXT: 2 3 0.50 * * U ldxp x17, x0, [x18] +# CHECK-NEXT: 2 4 0.50 * * U stlxrb w12, w22, [x0] +# CHECK-NEXT: 2 4 0.50 * * U stlxrh w10, w1, [x1] +# CHECK-NEXT: 2 4 0.50 * * U stlxr w9, w2, [x2] +# CHECK-NEXT: 2 4 0.50 * * U stlxr w9, x3, [sp] +# CHECK-NEXT: 1 3 0.50 * * U ldaxrb w8, [x4] +# CHECK-NEXT: 1 3 0.50 * * U ldaxrh w7, [x5] +# CHECK-NEXT: 1 3 0.50 * * U ldaxr w6, [sp] +# CHECK-NEXT: 1 3 0.50 * * U ldaxr x5, [x6] +# CHECK-NEXT: 1 3 0.50 * * U ldaxr x5, [x6] +# CHECK-NEXT: 1 3 0.50 * * U ldaxr x5, [x6] +# CHECK-NEXT: 2 4 0.50 * * U stlxp w4, w5, w6, [sp] +# CHECK-NEXT: 2 4 0.50 * * U stlxp wzr, x6, x7, 
[x1] +# CHECK-NEXT: 2 3 0.50 * * U ldaxp w5, w18, [sp] +# CHECK-NEXT: 2 3 0.50 * * U ldaxp x6, x19, [x22] +# CHECK-NEXT: 2 3 0.50 * * U ldaxp x6, x19, [x22] +# CHECK-NEXT: 1 1 0.50 * U stlrb w24, [sp] +# CHECK-NEXT: 1 1 0.50 * U stlrh w25, [x30] +# CHECK-NEXT: 1 1 0.50 * U stlr w26, [x29] +# CHECK-NEXT: 1 1 0.50 * U stlr x27, [x28] +# CHECK-NEXT: 1 1 0.50 * U stlr x27, [x28] +# CHECK-NEXT: 1 1 0.50 * U stlr x27, [x28] +# CHECK-NEXT: 1 3 0.50 * U ldarb w23, [sp] +# CHECK-NEXT: 1 3 0.50 * U ldarh w22, [x30] +# CHECK-NEXT: 1 3 0.50 * U ldar wzr, [x29] +# CHECK-NEXT: 1 3 0.50 * U ldar x21, [x28] +# CHECK-NEXT: 1 3 0.50 * U ldar x21, [x28] +# CHECK-NEXT: 1 3 0.50 * U ldar x21, [x28] +# CHECK-NEXT: 1 1 0.50 * sturb w9, [sp] +# CHECK-NEXT: 1 1 0.50 * sturh wzr, [x12, #255] +# CHECK-NEXT: 1 1 0.50 * stur w16, [x0, #-256] +# CHECK-NEXT: 1 1 0.50 * stur x28, [x14, #1] +# CHECK-NEXT: 1 3 0.50 * ldurb w1, [x20, #255] +# CHECK-NEXT: 1 3 0.50 * ldurh w20, [x1, #255] +# CHECK-NEXT: 1 3 0.50 * ldur w12, [sp, #255] +# CHECK-NEXT: 1 3 0.50 * ldur xzr, [x12, #255] +# CHECK-NEXT: 1 3 0.50 * ldursb x9, [x7, #-256] +# CHECK-NEXT: 1 3 0.50 * ldursh x17, [x19, #-256] +# CHECK-NEXT: 1 3 0.50 * ldursw x20, [x15, #-256] +# CHECK-NEXT: 1 1 0.50 U prfum pldl2keep, [sp, #-256] +# CHECK-NEXT: 1 3 0.50 * ldursb w19, [x1, #-256] +# CHECK-NEXT: 1 3 0.50 * ldursh w15, [x21, #-256] +# CHECK-NEXT: 2 2 1.00 * stur b0, [sp, #1] +# CHECK-NEXT: 2 2 1.00 * stur h12, [x12, #-1] +# CHECK-NEXT: 2 2 1.00 * stur s15, [x0, #255] +# CHECK-NEXT: 2 2 1.00 * stur d31, [x5, #25] +# CHECK-NEXT: 2 2 1.00 * stur q9, [x5] +# CHECK-NEXT: 1 4 0.50 * ldur b3, [sp] +# CHECK-NEXT: 1 4 0.50 * ldur h5, [x4, #-256] +# CHECK-NEXT: 1 4 0.50 * ldur s7, [x12, #-1] +# CHECK-NEXT: 1 4 0.50 * ldur d11, [x19, #4] +# CHECK-NEXT: 1 4 0.50 * ldur q13, [x1, #2] +# CHECK-NEXT: 2 1 0.50 * strb w9, [x2], #255 +# CHECK-NEXT: 2 1 0.50 * strb w10, [x3], #1 +# CHECK-NEXT: 2 1 0.50 * strb w10, [x3], #-256 +# CHECK-NEXT: 2 1 0.50 * strh w9, [x2], 
#255 +# CHECK-NEXT: 2 1 0.50 * strh w9, [x2], #1 +# CHECK-NEXT: 2 1 0.50 * strh w10, [x3], #-256 +# CHECK-NEXT: 2 1 0.50 * str w19, [sp], #255 +# CHECK-NEXT: 2 1 0.50 * str w20, [x30], #1 +# CHECK-NEXT: 2 1 0.50 * str w21, [x12], #-256 +# CHECK-NEXT: 2 1 0.50 * str xzr, [x9], #255 +# CHECK-NEXT: 2 1 0.50 * str x2, [x3], #1 +# CHECK-NEXT: 2 1 0.50 * str x19, [x12], #-256 +# CHECK-NEXT: 2 3 0.50 * ldrb w9, [x2], #255 +# CHECK-NEXT: 2 3 0.50 * ldrb w10, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldrb w10, [x3], #-256 +# CHECK-NEXT: 2 3 0.50 * ldrh w9, [x2], #255 +# CHECK-NEXT: 2 3 0.50 * ldrh w9, [x2], #1 +# CHECK-NEXT: 2 3 0.50 * ldrh w10, [x3], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr w19, [sp], #255 +# CHECK-NEXT: 2 3 0.50 * ldr w20, [x30], #1 +# CHECK-NEXT: 2 3 0.50 * ldr w21, [x12], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr xzr, [x9], #255 +# CHECK-NEXT: 2 3 0.50 * ldr x2, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldr x19, [x12], #-256 +# CHECK-NEXT: 2 3 0.50 * ldrsb xzr, [x9], #255 +# CHECK-NEXT: 2 3 0.50 * ldrsb x2, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldrsb x19, [x12], #-256 +# CHECK-NEXT: 2 3 0.50 * ldrsh xzr, [x9], #255 +# CHECK-NEXT: 2 3 0.50 * ldrsh x2, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldrsh x19, [x12], #-256 +# CHECK-NEXT: 2 3 0.50 * ldrsw xzr, [x9], #255 +# CHECK-NEXT: 2 3 0.50 * ldrsw x2, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldrsw x19, [x12], #-256 +# CHECK-NEXT: 2 3 0.50 * ldrsb wzr, [x9], #255 +# CHECK-NEXT: 2 3 0.50 * ldrsb w2, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldrsb w19, [x12], #-256 +# CHECK-NEXT: 2 3 0.50 * ldrsh wzr, [x9], #255 +# CHECK-NEXT: 2 3 0.50 * ldrsh w2, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldrsh w19, [x12], #-256 +# CHECK-NEXT: 2 1 0.50 * str b0, [x0], #255 +# CHECK-NEXT: 2 1 0.50 * str b3, [x3], #1 +# CHECK-NEXT: 2 1 0.50 * str b5, [sp], #-256 +# CHECK-NEXT: 2 1 0.50 * str h10, [x10], #255 +# CHECK-NEXT: 2 1 0.50 * str h13, [x23], #1 +# CHECK-NEXT: 2 1 0.50 * str h15, [sp], #-256 +# CHECK-NEXT: 2 1 0.50 * str s20, [x20], #255 +# CHECK-NEXT: 2 1 0.50 * str s23, 
[x23], #1 +# CHECK-NEXT: 2 1 0.50 * str s25, [x0], #-256 +# CHECK-NEXT: 2 1 0.50 * str d20, [x20], #255 +# CHECK-NEXT: 2 1 0.50 * str d23, [x23], #1 +# CHECK-NEXT: 2 1 0.50 * str d25, [x0], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr b0, [x0], #255 +# CHECK-NEXT: 2 3 0.50 * ldr b3, [x3], #1 +# CHECK-NEXT: 2 3 0.50 * ldr b5, [sp], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr h10, [x10], #255 +# CHECK-NEXT: 2 3 0.50 * ldr h13, [x23], #1 +# CHECK-NEXT: 2 3 0.50 * ldr h15, [sp], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr s20, [x20], #255 +# CHECK-NEXT: 2 3 0.50 * ldr s23, [x23], #1 +# CHECK-NEXT: 2 3 0.50 * ldr s25, [x0], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr d20, [x20], #255 +# CHECK-NEXT: 2 3 0.50 * ldr d23, [x23], #1 +# CHECK-NEXT: 2 3 0.50 * ldr d25, [x0], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr q20, [x1], #255 +# CHECK-NEXT: 2 3 0.50 * ldr q23, [x9], #1 +# CHECK-NEXT: 2 3 0.50 * ldr q25, [x20], #-256 +# CHECK-NEXT: 2 1 0.50 * str q10, [x1], #255 +# CHECK-NEXT: 2 1 0.50 * str q22, [sp], #1 +# CHECK-NEXT: 2 1 0.50 * str q21, [x20], #-256 +# CHECK-NEXT: 2 3 0.50 * ldr x3, [x4, #0]! +# CHECK-NEXT: 2 1 0.50 * strb w9, [x2, #255]! +# CHECK-NEXT: 2 1 0.50 * strb w10, [x3, #1]! +# CHECK-NEXT: 2 1 0.50 * strb w10, [x3, #-256]! +# CHECK-NEXT: 2 1 0.50 * strh w9, [x2, #255]! +# CHECK-NEXT: 2 1 0.50 * strh w9, [x2, #1]! +# CHECK-NEXT: 2 1 0.50 * strh w10, [x3, #-256]! +# CHECK-NEXT: 2 1 0.50 * str w19, [sp, #255]! +# CHECK-NEXT: 2 1 0.50 * str w20, [x30, #1]! +# CHECK-NEXT: 2 1 0.50 * str w21, [x12, #-256]! +# CHECK-NEXT: 2 1 0.50 * str xzr, [x9, #255]! +# CHECK-NEXT: 2 1 0.50 * str x2, [x3, #1]! +# CHECK-NEXT: 2 1 0.50 * str x19, [x12, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldrb w9, [x2, #255]! +# CHECK-NEXT: 2 3 0.50 * ldrb w10, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldrb w10, [x3, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldrh w9, [x2, #255]! +# CHECK-NEXT: 2 3 0.50 * ldrh w9, [x2, #1]! +# CHECK-NEXT: 2 3 0.50 * ldrh w10, [x3, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldr w19, [sp, #255]! 
+# CHECK-NEXT: 2 3 0.50 * ldr w20, [x30, #1]! +# CHECK-NEXT: 2 3 0.50 * ldr w21, [x12, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldr xzr, [x9, #255]! +# CHECK-NEXT: 2 3 0.50 * ldr x2, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldr x19, [x12, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldrsb xzr, [x9, #255]! +# CHECK-NEXT: 2 3 0.50 * ldrsb x2, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldrsb x19, [x12, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldrsh xzr, [x9, #255]! +# CHECK-NEXT: 2 3 0.50 * ldrsh x2, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldrsh x19, [x12, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldrsw xzr, [x9, #255]! +# CHECK-NEXT: 2 3 0.50 * ldrsw x2, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldrsw x19, [x12, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldrsb wzr, [x9, #255]! +# CHECK-NEXT: 2 3 0.50 * ldrsb w2, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldrsb w19, [x12, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldrsh wzr, [x9, #255]! +# CHECK-NEXT: 2 3 0.50 * ldrsh w2, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldrsh w19, [x12, #-256]! +# CHECK-NEXT: 2 1 0.50 * str b0, [x0, #255]! +# CHECK-NEXT: 2 1 0.50 * str b3, [x3, #1]! +# CHECK-NEXT: 2 1 0.50 * str b5, [sp, #-256]! +# CHECK-NEXT: 2 1 0.50 * str h10, [x10, #255]! +# CHECK-NEXT: 2 1 0.50 * str h13, [x23, #1]! +# CHECK-NEXT: 2 1 0.50 * str h15, [sp, #-256]! +# CHECK-NEXT: 2 1 0.50 * str s20, [x20, #255]! +# CHECK-NEXT: 2 1 0.50 * str s23, [x23, #1]! +# CHECK-NEXT: 2 1 0.50 * str s25, [x0, #-256]! +# CHECK-NEXT: 2 1 0.50 * str d20, [x20, #255]! +# CHECK-NEXT: 2 1 0.50 * str d23, [x23, #1]! +# CHECK-NEXT: 2 1 0.50 * str d25, [x0, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldr b0, [x0, #255]! +# CHECK-NEXT: 2 3 0.50 * ldr b3, [x3, #1]! +# CHECK-NEXT: 2 3 0.50 * ldr b5, [sp, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldr h10, [x10, #255]! +# CHECK-NEXT: 2 3 0.50 * ldr h13, [x23, #1]! +# CHECK-NEXT: 2 3 0.50 * ldr h15, [sp, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldr s20, [x20, #255]! +# CHECK-NEXT: 2 3 0.50 * ldr s23, [x23, #1]! +# CHECK-NEXT: 2 3 0.50 * ldr s25, [x0, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldr d20, [x20, #255]! 
+# CHECK-NEXT: 2 3 0.50 * ldr d23, [x23, #1]! +# CHECK-NEXT: 2 3 0.50 * ldr d25, [x0, #-256]! +# CHECK-NEXT: 2 3 0.50 * ldr q20, [x1, #255]! +# CHECK-NEXT: 2 3 0.50 * ldr q23, [x9, #1]! +# CHECK-NEXT: 2 3 0.50 * ldr q25, [x20, #-256]! +# CHECK-NEXT: 2 1 0.50 * str q10, [x1, #255]! +# CHECK-NEXT: 2 1 0.50 * str q22, [sp, #1]! +# CHECK-NEXT: 2 1 0.50 * str q21, [x20, #-256]! +# CHECK-NEXT: 1 1 0.50 * sttrb w9, [sp] +# CHECK-NEXT: 1 1 0.50 * sttrh wzr, [x12, #255] +# CHECK-NEXT: 1 1 0.50 * sttr w16, [x0, #-256] +# CHECK-NEXT: 1 1 0.50 * sttr x28, [x14, #1] +# CHECK-NEXT: 1 3 0.50 * ldtrb w1, [x20, #255] +# CHECK-NEXT: 1 3 0.50 * ldtrh w20, [x1, #255] +# CHECK-NEXT: 1 3 0.50 * ldtr w12, [sp, #255] +# CHECK-NEXT: 1 3 0.50 * ldtr xzr, [x12, #255] +# CHECK-NEXT: 1 3 0.50 * ldtrsb x9, [x7, #-256] +# CHECK-NEXT: 1 3 0.50 * ldtrsh x17, [x19, #-256] +# CHECK-NEXT: 1 3 0.50 * ldtrsw x20, [x15, #-256] +# CHECK-NEXT: 1 3 0.50 * ldtrsb w19, [x1, #-256] +# CHECK-NEXT: 1 3 0.50 * ldtrsh w15, [x21, #-256] +# CHECK-NEXT: 1 3 0.50 * ldr x4, [x29] +# CHECK-NEXT: 1 3 0.50 * ldr x30, [x12, #32760] +# CHECK-NEXT: 1 3 0.50 * ldr x20, [sp, #8] +# CHECK-NEXT: 1 3 0.50 * ldr xzr, [sp] +# CHECK-NEXT: 1 3 0.50 * ldr w2, [sp] +# CHECK-NEXT: 1 3 0.50 * ldr w17, [sp, #16380] +# CHECK-NEXT: 1 3 0.50 * ldr w13, [x2, #4] +# CHECK-NEXT: 1 3 0.50 * ldrsw x2, [x5, #4] +# CHECK-NEXT: 1 3 0.50 * ldrsw x23, [sp, #16380] +# CHECK-NEXT: 1 3 0.50 * ldrh w2, [x4] +# CHECK-NEXT: 1 3 0.50 * ldrsh w23, [x6, #8190] +# CHECK-NEXT: 1 3 0.50 * ldrsh wzr, [sp, #2] +# CHECK-NEXT: 1 3 0.50 * ldrsh x29, [x2, #2] +# CHECK-NEXT: 1 3 0.50 * ldrb w26, [x3, #121] +# CHECK-NEXT: 1 3 0.50 * ldrb w12, [x2] +# CHECK-NEXT: 1 3 0.50 * ldrsb w27, [sp, #4095] +# CHECK-NEXT: 1 3 0.50 * ldrsb xzr, [x15] +# CHECK-NEXT: 1 1 0.50 * str x30, [sp] +# CHECK-NEXT: 1 1 0.50 * str w20, [x4, #16380] +# CHECK-NEXT: 1 1 0.50 * strh w17, [sp, #8190] +# CHECK-NEXT: 1 1 0.50 * strb w23, [x3, #4095] +# CHECK-NEXT: 1 1 0.50 * strb wzr, [x2] +# 
CHECK-NEXT: 1 3 0.50 * ldr b31, [sp, #4095] +# CHECK-NEXT: 1 3 0.50 * ldr h20, [x2, #8190] +# CHECK-NEXT: 1 3 0.50 * ldr s10, [x19, #16380] +# CHECK-NEXT: 1 3 0.50 * ldr d3, [x10, #32760] +# CHECK-NEXT: 2 2 1.00 * str q12, [sp, #65520] +# CHECK-NEXT: 1 3 0.50 * ldrb w3, [sp, x5] +# CHECK-NEXT: 1 3 0.50 * ldrb w9, [x27, x6] +# CHECK-NEXT: 1 3 0.50 * ldrsb w10, [x30, x7] +# CHECK-NEXT: 1 3 0.50 * ldrb w11, [x29, x3, sxtx] +# CHECK-NEXT: 2 1 1.00 * strb w12, [x28, xzr, sxtx] +# CHECK-NEXT: 1 3 0.50 * ldrb w14, [x26, w6, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldrsb w15, [x25, w7, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldrb w17, [x23, w9, sxtw] +# CHECK-NEXT: 1 3 0.50 * ldrsb x18, [x22, w10, sxtw] +# CHECK-NEXT: 1 3 0.50 * ldrsh w3, [sp, x5] +# CHECK-NEXT: 1 3 0.50 * ldrsh w9, [x27, x6] +# CHECK-NEXT: 1 3 0.50 * ldrh w10, [x30, x7, lsl #1] +# CHECK-NEXT: 2 1 1.00 * strh w11, [x29, x3, sxtx] +# CHECK-NEXT: 1 3 0.50 * ldrh w12, [x28, xzr, sxtx] +# CHECK-NEXT: 1 3 0.50 * ldrsh x13, [x27, x5, sxtx #1] +# CHECK-NEXT: 1 3 0.50 * ldrh w14, [x26, w6, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldrh w15, [x25, w7, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldrsh w16, [x24, w8, uxtw #1] +# CHECK-NEXT: 1 3 0.50 * ldrh w17, [x23, w9, sxtw] +# CHECK-NEXT: 1 3 0.50 * ldrh w18, [x22, w10, sxtw] +# CHECK-NEXT: 2 1 1.00 * strh w19, [x21, wzr, sxtw #1] +# CHECK-NEXT: 1 3 0.50 * ldr w3, [sp, x5] +# CHECK-NEXT: 1 4 0.50 * ldr s9, [x27, x6] +# CHECK-NEXT: 1 3 0.50 * ldr w10, [x30, x7, lsl #2] +# CHECK-NEXT: 1 3 0.50 * ldr w11, [x29, x3, sxtx] +# CHECK-NEXT: 2 2 1.00 * str s12, [x28, xzr, sxtx] +# CHECK-NEXT: 2 1 1.00 * str w13, [x27, x5, sxtx #2] +# CHECK-NEXT: 2 1 1.00 * str w14, [x26, w6, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldr w15, [x25, w7, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldr w16, [x24, w8, uxtw #2] +# CHECK-NEXT: 1 3 0.50 * ldrsw x17, [x23, w9, sxtw] +# CHECK-NEXT: 1 3 0.50 * ldr w18, [x22, w10, sxtw] +# CHECK-NEXT: 1 3 0.50 * ldrsw x19, [x21, wzr, sxtw #2] +# CHECK-NEXT: 1 3 0.50 * ldr x3, [sp, x5] +# CHECK-NEXT: 2 1 1.00 * 
str x9, [x27, x6] +# CHECK-NEXT: 1 4 0.50 * ldr d10, [x30, x7, lsl #3] +# CHECK-NEXT: 2 1 1.00 * str x11, [x29, x3, sxtx] +# CHECK-NEXT: 1 3 0.50 * ldr x12, [x28, xzr, sxtx] +# CHECK-NEXT: 1 3 0.50 * ldr x13, [x27, x5, sxtx #3] +# CHECK-NEXT: 1 1 0.50 U prfm pldl1keep, [x26, w6, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldr x15, [x25, w7, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldr x16, [x24, w8, uxtw #3] +# CHECK-NEXT: 1 3 0.50 * ldr x17, [x23, w9, sxtw] +# CHECK-NEXT: 1 3 0.50 * ldr x18, [x22, w10, sxtw] +# CHECK-NEXT: 2 2 1.00 * str d19, [x21, wzr, sxtw #3] +# CHECK-NEXT: 1 4 0.50 * ldr q3, [sp, x5] +# CHECK-NEXT: 1 4 0.50 * ldr q9, [x27, x6] +# CHECK-NEXT: 1 4 0.50 * ldr q10, [x30, x7, lsl #4] +# CHECK-NEXT: 2 2 1.00 * str q11, [x29, x3, sxtx] +# CHECK-NEXT: 2 2 1.00 * str q12, [x28, xzr, sxtx] +# CHECK-NEXT: 2 2 1.00 * str q13, [x27, x5, sxtx #4] +# CHECK-NEXT: 1 4 0.50 * ldr q14, [x26, w6, uxtw] +# CHECK-NEXT: 1 4 0.50 * ldr q15, [x25, w7, uxtw] +# CHECK-NEXT: 1 4 0.50 * ldr q16, [x24, w8, uxtw #4] +# CHECK-NEXT: 1 4 0.50 * ldr q17, [x23, w9, sxtw] +# CHECK-NEXT: 2 2 1.00 * str q18, [x22, w10, sxtw] +# CHECK-NEXT: 1 4 0.50 * ldr q19, [x21, wzr, sxtw #4] +# CHECK-NEXT: 1 3 0.50 * ldp w3, w5, [sp] +# CHECK-NEXT: 2 2 0.50 * stp wzr, w9, [sp, #252] +# CHECK-NEXT: 1 3 0.50 * ldp w2, wzr, [sp, #-256] +# CHECK-NEXT: 1 3 0.50 * ldp w9, w10, [sp, #4] +# CHECK-NEXT: 2 5 1.00 * ldpsw x9, x10, [sp, #4] +# CHECK-NEXT: 2 5 1.00 * ldpsw x9, x10, [x2, #-256] +# CHECK-NEXT: 2 5 1.00 * ldpsw x20, x30, [sp, #252] +# CHECK-NEXT: 1 3 0.50 * ldp x21, x29, [x2, #504] +# CHECK-NEXT: 1 3 0.50 * ldp x22, x23, [x3, #-512] +# CHECK-NEXT: 1 3 0.50 * ldp x24, x25, [x4, #8] +# CHECK-NEXT: 2 5 1.00 * ldp s29, s28, [sp, #252] +# CHECK-NEXT: 4 3 2.00 * stp s27, s26, [sp, #-256] +# CHECK-NEXT: 2 5 1.00 * ldp s1, s2, [x3, #44] +# CHECK-NEXT: 4 3 2.00 * stp d3, d5, [x9, #504] +# CHECK-NEXT: 4 3 2.00 * stp d7, d11, [x10, #-512] +# CHECK-NEXT: 2 5 1.00 * ldp d2, d3, [x30, #-8] +# CHECK-NEXT: 4 3 2.00 * stp q3, q5, 
[sp] +# CHECK-NEXT: 4 3 2.00 * stp q17, q19, [sp, #1008] +# CHECK-NEXT: 2 4 1.00 * ldp q23, q29, [x1, #-1024] +# CHECK-NEXT: 1 3 0.50 * ldp w3, w5, [sp], #0 +# CHECK-NEXT: 3 2 0.50 * stp wzr, w9, [sp], #252 +# CHECK-NEXT: 1 3 0.50 * ldp w2, wzr, [sp], #-256 +# CHECK-NEXT: 1 3 0.50 * ldp w9, w10, [sp], #4 +# CHECK-NEXT: 2 5 1.00 * ldpsw x9, x10, [sp], #4 +# CHECK-NEXT: 2 5 1.00 * ldpsw x9, x10, [x2], #-256 +# CHECK-NEXT: 2 5 1.00 * ldpsw x20, x30, [sp], #252 +# CHECK-NEXT: 1 3 0.50 * ldp x21, x29, [x2], #504 +# CHECK-NEXT: 1 3 0.50 * ldp x22, x23, [x3], #-512 +# CHECK-NEXT: 1 3 0.50 * ldp x24, x25, [x4], #8 +# CHECK-NEXT: 2 5 1.00 * ldp s29, s28, [sp], #252 +# CHECK-NEXT: 4 3 2.00 * stp s27, s26, [sp], #-256 +# CHECK-NEXT: 2 5 1.00 * ldp s1, s2, [x3], #44 +# CHECK-NEXT: 4 3 2.00 * stp d3, d5, [x9], #504 +# CHECK-NEXT: 4 3 2.00 * stp d7, d11, [x10], #-512 +# CHECK-NEXT: 2 5 1.00 * ldp d2, d3, [x30], #-8 +# CHECK-NEXT: 4 3 2.00 * stp q3, q5, [sp], #0 +# CHECK-NEXT: 4 3 2.00 * stp q17, q19, [sp], #1008 +# CHECK-NEXT: 2 4 1.00 * ldp q23, q29, [x1], #-1024 +# CHECK-NEXT: 1 3 0.50 * ldp w3, w5, [sp, #0]! +# CHECK-NEXT: 3 2 0.50 * stp wzr, w9, [sp, #252]! +# CHECK-NEXT: 1 3 0.50 * ldp w2, wzr, [sp, #-256]! +# CHECK-NEXT: 1 3 0.50 * ldp w9, w10, [sp, #4]! +# CHECK-NEXT: 2 5 1.00 * ldpsw x9, x10, [sp, #4]! +# CHECK-NEXT: 2 5 1.00 * ldpsw x9, x10, [x2, #-256]! +# CHECK-NEXT: 2 5 1.00 * ldpsw x20, x30, [sp, #252]! +# CHECK-NEXT: 1 3 0.50 * ldp x21, x29, [x2, #504]! +# CHECK-NEXT: 1 3 0.50 * ldp x22, x23, [x3, #-512]! +# CHECK-NEXT: 1 3 0.50 * ldp x24, x25, [x4, #8]! +# CHECK-NEXT: 2 5 1.00 * ldp s29, s28, [sp, #252]! +# CHECK-NEXT: 4 3 2.00 * stp s27, s26, [sp, #-256]! +# CHECK-NEXT: 2 5 1.00 * ldp s1, s2, [x3, #44]! +# CHECK-NEXT: 4 3 2.00 * stp d3, d5, [x9, #504]! +# CHECK-NEXT: 4 3 2.00 * stp d7, d11, [x10, #-512]! +# CHECK-NEXT: 2 5 1.00 * ldp d2, d3, [x30, #-8]! +# CHECK-NEXT: 4 3 2.00 * stp q3, q5, [sp, #0]! +# CHECK-NEXT: 4 3 2.00 * stp q17, q19, [sp, #1008]! 
+# CHECK-NEXT: 2 4 1.00 * ldp q23, q29, [x1, #-1024]! +# CHECK-NEXT: 1 3 0.50 * ldnp w3, w5, [sp] +# CHECK-NEXT: 2 1 1.00 * stnp wzr, w9, [sp, #252] +# CHECK-NEXT: 1 3 0.50 * ldnp w2, wzr, [sp, #-256] +# CHECK-NEXT: 1 3 0.50 * ldnp w9, w10, [sp, #4] +# CHECK-NEXT: 1 3 0.50 * ldnp x21, x29, [x2, #504] +# CHECK-NEXT: 1 3 0.50 * ldnp x22, x23, [x3, #-512] +# CHECK-NEXT: 1 3 0.50 * ldnp x24, x25, [x4, #8] +# CHECK-NEXT: 2 5 1.00 * ldnp s29, s28, [sp, #252] +# CHECK-NEXT: 4 3 2.00 * stnp s27, s26, [sp, #-256] +# CHECK-NEXT: 2 5 1.00 * ldnp s1, s2, [x3, #44] +# CHECK-NEXT: 4 3 2.00 * stnp d3, d5, [x9, #504] +# CHECK-NEXT: 4 3 2.00 * stnp d7, d11, [x10, #-512] +# CHECK-NEXT: 2 5 1.00 * ldnp d2, d3, [x30, #-8] +# CHECK-NEXT: 4 3 2.00 * stnp q3, q5, [sp] +# CHECK-NEXT: 4 3 2.00 * stnp q17, q19, [sp, #1008] +# CHECK-NEXT: 2 4 1.00 * ldnp q23, q29, [x1, #-1024] +# CHECK-NEXT: 1 1 0.25 mov w3, #983055 +# CHECK-NEXT: 1 1 0.25 mov x10, #-6148914691236517206 +# CHECK-NEXT: 1 1 0.25 and w12, w23, w21 +# CHECK-NEXT: 1 1 0.25 and w16, w15, w1, lsl #1 +# CHECK-NEXT: 2 2 0.50 and w9, w4, w10, lsl #31 +# CHECK-NEXT: 1 1 0.25 and w3, w30, w11 +# CHECK-NEXT: 2 2 0.50 and x3, x5, x7, lsl #63 +# CHECK-NEXT: 2 2 0.50 and x5, x14, x19, asr #4 +# CHECK-NEXT: 2 2 0.50 and w3, w17, w19, ror #31 +# CHECK-NEXT: 2 2 0.50 and w0, w2, wzr, lsr #17 +# CHECK-NEXT: 2 2 0.50 and w3, w30, w11, asr #2 +# CHECK-NEXT: 1 1 0.25 and xzr, x4, x26 +# CHECK-NEXT: 2 2 0.50 and w3, wzr, w20, ror #2 +# CHECK-NEXT: 2 2 0.50 and x7, x20, xzr, asr #63 +# CHECK-NEXT: 2 2 0.50 bic x13, x20, x14, lsl #47 +# CHECK-NEXT: 1 1 0.25 bic w2, w7, w9 +# CHECK-NEXT: 2 2 0.50 orr w2, w7, w0, asr #31 +# CHECK-NEXT: 2 2 0.50 orr x8, x9, x10, lsl #12 +# CHECK-NEXT: 2 2 0.50 orn x3, x5, x7, asr #2 +# CHECK-NEXT: 1 1 0.25 orn w2, w5, w29 +# CHECK-NEXT: 1 1 0.25 ands w7, wzr, w9, lsl #1 +# CHECK-NEXT: 2 2 0.50 ands x3, x5, x20, ror #63 +# CHECK-NEXT: 1 1 0.25 bics w3, w5, w7 +# CHECK-NEXT: 1 1 0.25 bics x3, xzr, x3, lsl #1 +# 
CHECK-NEXT: 2 2 0.50 tst w3, w7, lsl #31 +# CHECK-NEXT: 2 2 0.50 tst x2, x20, asr #2 +# CHECK-NEXT: 1 1 0.25 mov x3, x6 +# CHECK-NEXT: 1 1 0.25 mov x3, xzr +# CHECK-NEXT: 1 1 0.25 mov wzr, w2 +# CHECK-NEXT: 1 1 0.25 mov w3, w5 +# CHECK-NEXT: 1 1 0.25 movz w2, #0, lsl #16 +# CHECK-NEXT: 1 1 0.25 mov w2, #-1235 +# CHECK-NEXT: 1 1 0.25 mov x2, #5299989643264 +# CHECK-NEXT: 1 1 0.25 mov x2, #0 +# CHECK-NEXT: 1 1 0.25 movk w3, #0 +# CHECK-NEXT: 1 1 0.25 movz x4, #0, lsl #16 +# CHECK-NEXT: 1 1 0.25 movk w5, #0, lsl #16 +# CHECK-NEXT: 1 1 0.25 movz x6, #0, lsl #32 +# CHECK-NEXT: 1 1 0.25 movk x7, #0, lsl #32 +# CHECK-NEXT: 1 1 0.25 movz x8, #0, lsl #48 +# CHECK-NEXT: 1 1 0.25 movk x9, #0, lsl #48 +# CHECK-NEXT: 1 1 0.50 adr x2, #1600 +# CHECK-NEXT: 1 1 0.50 adrp x21, #6553600 +# CHECK-NEXT: 1 1 0.50 adr x0, #262144 +# CHECK-NEXT: 1 1 0.50 tbz x12, #62, #0 +# CHECK-NEXT: 1 1 0.50 tbz x12, #62, #4 +# CHECK-NEXT: 1 1 0.50 tbz x12, #62, #-32768 +# CHECK-NEXT: 1 1 0.50 tbnz x12, #60, #32764 +# CHECK-NEXT: 1 1 0.50 b #4 +# CHECK-NEXT: 1 1 0.50 b #-4 +# CHECK-NEXT: 1 1 0.50 b #134217724 +# CHECK-NEXT: 1 1 1.00 br x20 +# CHECK-NEXT: 2 1 1.00 blr xzr +# CHECK-NEXT: 1 1 0.50 U ret x10 +# CHECK-NEXT: 1 1 0.50 U ret +# CHECK-NEXT: 1 1 1.00 U eret +# CHECK-NEXT: 1 1 1.00 U drps + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - Ampere1BUnitA +# CHECK-NEXT: [0.1] - Ampere1BUnitA +# CHECK-NEXT: [1.0] - Ampere1BUnitB +# CHECK-NEXT: [1.1] - Ampere1BUnitB +# CHECK-NEXT: [2] - Ampere1BUnitBS +# CHECK-NEXT: [3.0] - Ampere1BUnitL +# CHECK-NEXT: [3.1] - Ampere1BUnitL +# CHECK-NEXT: [4.0] - Ampere1BUnitS +# CHECK-NEXT: [4.1] - Ampere1BUnitS +# CHECK-NEXT: [5] - Ampere1BUnitX +# CHECK-NEXT: [6] - Ampere1BUnitY +# CHECK-NEXT: [7] - Ampere1BUnitZ + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4.0] [4.1] [5] [6] [7] +# CHECK-NEXT: 190.00 190.00 211.00 211.00 143.00 130.50 130.50 83.00 83.00 159.00 126.00 150.00 + +# CHECK: Resource pressure by 
instruction: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4.0] [4.1] [5] [6] [7] Instructions: +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w2, w3, #4095 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w30, w29, #1, lsl #12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w13, w5, #4095, lsl #12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x5, x7, #1638 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w20, wsp, #801 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add wsp, wsp, #1104 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add wsp, w30, #4084 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x0, x24, #291 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x3, x24, #4095, lsl #12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x8, sp, #1074 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add sp, x29, #3816 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub w0, wsp, #4077 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub w4, w20, #546, lsl #12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub sp, sp, #288 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub wsp, w19, #16 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adds w13, w23, #291, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cmn w2, #4095 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adds w20, wsp, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cmn x3, #1, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cmp sp, #20, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cmp x30, #4095 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - subs x4, sp, #3822 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cmn w3, #291, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cmn wsp, #1365 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cmn sp, #1092, lsl #12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov sp, x30 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov wsp, w20 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - 
- - - - mov x11, sp +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov w24, wsp +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w3, w5, w7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add wzr, w3, w5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w20, wzr, w4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w4, w6, wzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add w11, w13, w15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w9, w3, wzr, lsl #10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w17, w29, w20, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w21, w22, w23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w24, w25, w26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w27, w28, w29, lsr #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w2, w3, w4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w5, w6, w7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add w8, w9, w10, asr #31 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x3, x5, x7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add xzr, x3, x5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x20, xzr, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x4, x6, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - add x11, x13, x15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x9, x3, xzr, lsl #10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x17, x29, x20, lsl #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x21, x22, x23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x24, x25, x26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x27, x28, x29, lsr #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x2, x3, x4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x5, x6, x7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - add x8, x9, x10, asr #63 +# CHECK-NEXT: 
0.25 0.25 0.25 0.25 - - - - - - - - adds w3, w5, w7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn w3, w5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - adds w20, wzr, w4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - adds w4, w6, wzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - adds w11, w13, w15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w9, w3, wzr, lsl #10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w17, w29, w20, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w21, w22, w23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w24, w25, w26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w27, w28, w29, lsr #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w2, w3, w4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w5, w6, w7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds w8, w9, w10, asr #31 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - adds x3, x5, x7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn x3, x5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - adds x20, xzr, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - adds x4, x6, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - adds x11, x13, x15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x9, x3, xzr, lsl #10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x17, x29, x20, lsl #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x21, x22, x23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x24, x25, x26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x27, x28, x29, lsr #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x2, x3, x4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x5, x6, x7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - adds x8, x9, x10, asr #63 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub w3, w5, w7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 
- - - - - - - sub wzr, w3, w5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub w4, w6, wzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub w11, w13, w15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w9, w3, wzr, lsl #10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w17, w29, w20, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w21, w22, w23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w24, w25, w26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w27, w28, w29, lsr #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w2, w3, w4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w5, w6, w7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub w8, w9, w10, asr #31 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub x3, x5, x7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub xzr, x3, x5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub x4, x6, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - sub x11, x13, x15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x9, x3, xzr, lsl #10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x17, x29, x20, lsl #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x21, x22, x23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x24, x25, x26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x27, x28, x29, lsr #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x2, x3, x4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x5, x6, x7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - sub x8, x9, x10, asr #63 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subs w3, w5, w7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp w3, w5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subs w4, w6, wzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subs w11, w13, w15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w9, w3, wzr, lsl 
#10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w17, w29, w20, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w21, w22, w23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w24, w25, w26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w27, w28, w29, lsr #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w2, w3, w4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w5, w6, w7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs w8, w9, w10, asr #31 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subs x3, x5, x7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp x3, x5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subs x4, x6, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subs x11, x13, x15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x9, x3, xzr, lsl #10 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x17, x29, x20, lsl #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x21, x22, x23, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x24, x25, x26, lsr #18 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x27, x28, x29, lsr #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x2, x3, x4, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x5, x6, x7, asr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - subs x8, x9, x10, asr #63 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn wzr, w4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn w5, wzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn w6, w7 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn w8, w9, lsl #15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn w10, w11, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn w12, w13, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn w14, w15, lsr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn w16, w17, lsr #31 +# CHECK-NEXT: 
0.25 0.25 0.75 0.75 - - - - - - - - cmn w18, w19, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn w20, w21, asr #22 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn w22, w23, asr #31 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn x0, x3 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn xzr, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn x5, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmn x6, x7 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x8, x9, lsl #15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x10, x11, lsl #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x12, x13, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x14, x15, lsr #41 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x16, x17, lsr #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x18, x19, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x20, x21, asr #55 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmn x22, x23, asr #63 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp w0, w3 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp wzr, w4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp w5, wzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp w6, w7 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp w8, w9, lsl #15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp w10, w11, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp w12, w13, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp w14, w15, lsr #21 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp w18, w19, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp w20, w21, asr #22 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp w22, w23, asr #31 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp x0, x3 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp xzr, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp x5, xzr +# CHECK-NEXT: 0.25 
0.25 0.25 0.25 - - - - - - - - cmp x6, x7 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x8, x9, lsl #15 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x10, x11, lsl #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x12, x13, lsr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x14, x15, lsr #41 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x16, x17, lsr #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x18, x19, asr #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x20, x21, asr #55 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - cmp x22, x23, asr #63 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp wzr, w0 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - cmp xzr, x0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc w29, w27, w25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc wzr, w3, w4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc w9, wzr, w10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc w20, w0, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc x29, x27, x25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc xzr, x3, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc x9, xzr, x10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adc x20, x0, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs w29, w27, w25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs wzr, w3, w4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs w9, wzr, w10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs w20, w0, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs x29, x27, x25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs xzr, x3, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs x9, xzr, x10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adcs x20, x0, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbc w29, w27, w25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbc wzr, w3, w4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc w9, w10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbc w20, w0, wzr +# 
CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbc x29, x27, x25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbc xzr, x3, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc x9, x10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbc x20, x0, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbcs w29, w27, w25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbcs wzr, w3, w4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs w9, w10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbcs w20, w0, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbcs x29, x27, x25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbcs xzr, x3, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs x9, x10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - sbcs x20, x0, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc w3, w12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc wzr, w9 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc w23, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc x29, x30 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc xzr, x0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngc x0, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs w3, w12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs wzr, w9 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs w23, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs x29, x30 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs xzr, x0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ngcs x0, xzr +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfx x1, x2, #3, #2 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x3, x4, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr wzr, wzr, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfx w12, w9, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ubfiz x4, x5, #52, #11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ubfx xzr, x4, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ubfiz x4, xzr, #1, #6 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x5, x6, #12 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfi x4, 
x5, #52, #11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil xzr, x4, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfc x4, #1, #6 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil x5, x6, #12, #52 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sxtb w1, w2 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sxtb xzr, w3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sxth w9, w10 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sxth x0, w1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sxtw x3, w30 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - uxtb w1, w2 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - uxth w9, w10 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ubfx x3, x30, #0, #32 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w3, w2, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w9, w10, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x20, x21, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w1, wzr, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w3, w2, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w9, w10, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x20, x21, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr wzr, wzr, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w3, w2, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl w9, w10, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl x20, x21, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl w1, wzr, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfx w9, w10, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfiz x2, x3, #63, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x19, x20, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfiz x9, x10, #5, #59 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w9, w10, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfiz w11, w12, #31, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfiz w13, w14, #29, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfiz xzr, xzr, #10, #11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfx 
w9, w10, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x2, x3, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x19, x20, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x9, x10, #5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w9, w10, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w11, w12, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w13, w14, #29 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - sbfx xzr, xzr, #10, #11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil w9, w10, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfi x2, x3, #63, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil x19, x20, #0, #64 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfi x9, x10, #5, #59 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil w9, w10, #0, #32 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfi w11, w12, #31, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfi w13, w14, #29, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfc xzr, #10, #11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil w9, w10, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil x2, x3, #63, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil x19, x20, #0, #64 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil x9, x10, #5, #59 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil w9, w10, #0, #32 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil w11, w12, #31, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil w13, w14, #29, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - bfxil xzr, xzr, #10, #11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ubfx w9, w10, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl x2, x3, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x19, x20, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl x9, x10, #5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w9, w10, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl w11, w12, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl w13, w14, #29 +# CHECK-NEXT: - - 0.50 
0.50 - - - - - - - - ubfiz xzr, xzr, #10, #11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ubfx w9, w10, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x2, x3, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x19, x20, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x9, x10, #5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w9, w10, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w11, w12, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w13, w14, #29 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ubfx xzr, xzr, #10, #11 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cbz w5, #4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cbz x5, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cbnz x2, #-4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cbnz x26, #1048572 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cbz wzr, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cbnz xzr, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - b.ne #4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - b.ge #1048572 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - b.ge #-4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp w1, #31, #0, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp w3, #0, #15, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp wzr, #15, #13, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp x9, #31, #0, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp x3, #0, #15, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp xzr, #5, #7, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn w1, #31, #0, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn w3, #0, #15, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn wzr, #15, #13, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn x9, #31, #0, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn x3, #0, #15, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn xzr, #5, #7, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp w1, wzr, #0, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp w3, w0, #15, hs +# 
CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp wzr, w15, #13, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp x9, xzr, #0, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp x3, x0, #15, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmp xzr, x5, #7, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn w1, wzr, #0, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn w3, w0, #15, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn wzr, w15, #13, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn x9, xzr, #0, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn x3, x0, #15, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ccmn xzr, x5, #7, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel w1, w0, w19, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel wzr, w5, w9, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel w9, wzr, w30, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel w1, w28, wzr, mi +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel x19, x23, x29, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel xzr, x3, x4, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel x5, xzr, x6, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csel x7, x8, xzr, lo +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc w1, w0, w19, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc wzr, w5, w9, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc w9, wzr, w30, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc w1, w28, wzr, mi +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc x19, x23, x29, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc xzr, x3, x4, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc x5, xzr, x6, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc x7, x8, xzr, lo +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv w1, w0, w19, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv wzr, w5, w9, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv w9, wzr, w30, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv w1, w28, wzr, 
mi +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv x19, x23, x29, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv xzr, x3, x4, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv x5, xzr, x6, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv x7, x8, xzr, lo +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg w1, w0, w19, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg wzr, w5, w9, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg w9, wzr, w30, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg w1, w28, wzr, mi +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg x19, x23, x29, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg xzr, x3, x4, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg x5, xzr, x6, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg x7, x8, xzr, lo +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cset w3, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cset x9, pl +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csetm w20, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csetm x30, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc w2, wzr, wzr, al +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv x3, xzr, xzr, nv +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinc w3, w5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinc wzr, w4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cset w9, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinc x3, x5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinc xzr, x4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cset x9, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc w5, w6, w6, nv +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinc x1, x2, x2, al +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinv w3, w5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinv wzr, w4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csetm w9, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinv x3, x5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cinv xzr, x4, le +# CHECK-NEXT: 0.50 
0.50 - - - - - - - - - - csetm x9, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv x1, x0, x0, al +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv w9, w8, w8, nv +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cneg w3, w5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cneg wzr, w4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cneg w9, wzr, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cneg x3, x5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cneg xzr, x4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - cneg x9, xzr, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csneg x4, x8, x8, al +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - csinv w9, w8, w8, nv +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - rbit w0, w7 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - rbit x18, x3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - rev16 w17, w1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - rev16 x5, x2 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - rev w18, w0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - rev32 x20, x1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - rev x22, x2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - clz w24, w3 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - clz x26, x4 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - cls w3, w5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - cls x20, x5 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 - - udiv w0, w7, w10 +# CHECK-NEXT: - - - - 2.00 - - - - 1.00 - - udiv x9, x22, x4 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 - - sdiv w12, w21, w0 +# CHECK-NEXT: - - - - 2.00 - - - - 1.00 - - sdiv x13, x2, x1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl w11, w12, w13 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl x14, x15, x16 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w17, w18, w19 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x20, x21, x22 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w23, w24, w25 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x26, x27, x28 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ror w0, w1, 
w2 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ror x3, x4, x5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl w6, w7, w8 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsl x9, x10, x11 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr w12, w13, w14 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - lsr x15, x16, x17 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr w18, w19, w20 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - asr x21, x22, x23 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ror w24, w25, w26 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ror x27, x28, x29 +# CHECK-NEXT: - - - - 1.00 - - - - - - - smulh x30, x29, x28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - smulh xzr, x27, x26 +# CHECK-NEXT: - - - - 1.00 - - - - - - - umulh x30, x29, x28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - umulh x23, x30, xzr +# CHECK-NEXT: - - - - 1.00 - - - - - - - madd w1, w3, w7, w4 +# CHECK-NEXT: - - - - 1.00 - - - - - - - madd wzr, w0, w9, w11 +# CHECK-NEXT: - - - - 1.00 - - - - - - - madd w13, wzr, w4, w4 +# CHECK-NEXT: - - - - 1.00 - - - - - - - madd w19, w30, wzr, w29 +# CHECK-NEXT: - - - - 1.00 - - - - - - - mul w4, w5, w6 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - madd x1, x3, x7, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - madd xzr, x0, x9, x11 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - madd x13, xzr, x4, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - madd x19, x30, xzr, x29 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - mul x4, x5, x6 +# CHECK-NEXT: - - - - 1.00 - - - - - - - msub w1, w3, w7, w4 +# CHECK-NEXT: - - - - 1.00 - - - - - - - msub wzr, w0, w9, w11 +# CHECK-NEXT: - - - - 1.00 - - - - - - - msub w13, wzr, w4, w4 +# CHECK-NEXT: - - - - 1.00 - - - - - - - msub w19, w30, wzr, w29 +# CHECK-NEXT: - - - - 1.00 - - - - - - - mneg w4, w5, w6 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - msub x1, x3, x7, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - msub xzr, x0, x9, x11 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 
1.00 - - - - - - - msub x13, xzr, x4, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - msub x19, x30, xzr, x29 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - mneg x4, x5, x6 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smaddl x3, w5, w2, x9 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smaddl xzr, w10, w11, x12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smaddl x13, wzr, w14, x15 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smaddl x16, w17, wzr, x18 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smull x19, w20, w21 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smsubl x3, w5, w2, x9 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smsubl xzr, w10, w11, x12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smsubl x13, wzr, w14, x15 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smsubl x16, w17, wzr, x18 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smnegl x19, w20, w21 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umaddl x3, w5, w2, x9 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umaddl xzr, w10, w11, x12 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umaddl x13, wzr, w14, x15 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umaddl x16, w17, wzr, x18 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umull x19, w20, w21 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umsubl x3, w5, w2, x9 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umsubl x16, w17, wzr, x18 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umnegl x19, w20, w21 +# CHECK-NEXT: - - - - 1.00 - - - - - - - smulh x30, x29, x28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - smulh x23, x22, xzr +# CHECK-NEXT: - - - - 1.00 - - - - - - - umulh x23, x22, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - mul x19, x20, xzr +# CHECK-NEXT: - - - - 1.00 - - - - - - - mneg w21, w22, w23 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smull x11, w13, w17 +# CHECK-NEXT: 
0.25 0.25 0.25 0.25 1.00 - - - - - - - umull x11, w13, w17 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - smnegl x11, w13, w17 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 1.00 - - - - - - - umnegl x11, w13, w17 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - extr w3, w5, w7, #0 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - extr w11, w13, w17, #31 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - extr x3, x5, x7, #15 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - extr x11, x13, x17, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ror x19, x23, #24 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ror x29, xzr, #63 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ror w9, w13, #31 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmp s3, s5 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmp s31, #0.0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmp s31, #0.0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmpe s29, s30 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmpe s15, #0.0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmpe s15, #0.0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmp d4, d12 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmp d23, #0.0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmp d23, #0.0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmpe d26, d22 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmpe d29, #0.0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fcmpe d29, #0.0 +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmp s1, s31, #0, eq +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmp s3, s0, #15, hs +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmp s31, s15, #13, hs +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmp d9, d31, #0, le +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmp d3, d0, #15, gt +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmp d31, d5, #7, ne +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmpe s1, s31, #0, eq +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmpe s3, s0, #15, hs +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmpe s31, 
s15, #13, hs +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmpe d9, d31, #0, le +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmpe d3, d0, #15, gt +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 1.00 - - fccmpe d31, d5, #7, ne +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 0.50 0.50 - fcsel s3, s20, s9, pl +# CHECK-NEXT: 0.50 0.50 - - 1.00 - - - - 0.50 0.50 - fcsel d9, d10, d11, mi +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov s0, s1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabs s2, s3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fneg s4, s5 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fsqrt s6, s7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvt d8, s9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvt h10, s11 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintn s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintp s14, s15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintm s16, s17 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintz s18, s19 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinta s20, s21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintx s22, s23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinti s24, s25 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov d0, d1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabs d2, d3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fneg d4, d5 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fsqrt d6, d7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvt s8, d9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvt h10, d11 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintn d12, d13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintp d14, d15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintm d16, d17 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintz d18, d19 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinta d20, d21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintx d22, d23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinti d24, d25 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvt s26, h27 +# CHECK-NEXT: 
- - - - - - - - - 0.50 0.50 - fcvt d28, h29 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmul s20, s19, s17 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fdiv s1, s2, s3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fadd s4, s5, s6 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fsub s7, s8, s9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmax s10, s11, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmin s13, s14, s15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnm s16, s17, s18 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminnm s19, s20, s21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fnmul s22, s23, s2 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmul d20, d19, d17 +# CHECK-NEXT: - - - - - - - - - 1.00 - - fdiv d1, d2, d3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fadd d4, d5, d6 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fsub d7, d8, d9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmax d10, d11, d12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmin d13, d14, d15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnm d16, d17, d18 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminnm d19, d20, d21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fnmul d22, d23, d24 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmadd s3, s5, s6, s31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmadd d3, d13, d0, d23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmsub s3, s5, s6, s31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmsub d3, d13, d0, d23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fnmadd s3, s5, s6, s31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fnmadd d3, d13, d0, d23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fnmsub s3, s5, s6, s31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fnmsub d3, d13, d0, d23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w3, h5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs wzr, h20, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w19, h0, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x3, h5, #1 +# 
CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x12, h30, #45 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x19, h0, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w3, s5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs wzr, s20, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w19, s0, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x3, s5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x12, s30, #45 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x19, s0, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w3, d5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs wzr, d20, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w19, d0, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x3, d5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x12, d30, #45 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x19, d0, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w3, h5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu wzr, h20, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w19, h0, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x3, h5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x12, h30, #45 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x19, h0, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w3, s5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu wzr, s20, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w19, s0, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x3, s5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x12, s30, #45 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x19, s0, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w3, d5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu wzr, d20, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w19, d0, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 
0.50 1.00 fcvtzu x3, d5, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x12, d30, #45 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x19, d0, #64 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h23, w19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h31, wzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h14, w0, #32 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h23, x19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h31, xzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h14, x0, #64 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s23, w19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s31, wzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s14, w0, #32 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s23, x19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s31, xzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s14, x0, #64 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d23, w19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d31, wzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d14, w0, #32 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d23, x19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d31, xzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d14, x0, #64 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf h23, w19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf h31, wzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf h14, w0, #32 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf h23, x19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf h31, xzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf h14, x0, #64 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf s23, w19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf s31, wzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf s14, w0, #32 +# CHECK-NEXT: - 
- - - 1.00 - - - - 1.00 1.00 - ucvtf s23, x19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf s31, xzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf s14, x0, #64 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d23, w19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d31, wzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d14, w0, #32 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d23, x19, #1 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d31, xzr, #20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d14, x0, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtns w3, h31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtns xzr, h12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtnu wzr, h12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtnu x0, h0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtps wzr, h9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtps x12, h20 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtpu w30, h23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtpu x29, h3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtms w2, h3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtms x4, h5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtmu w6, h7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtmu x8, h9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w10, h11 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x12, h13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w14, h15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x15, h16 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h17, w18 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h19, x20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf h21, w22 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf h23, x24 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtas w25, h26 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtas x27, h28 +# CHECK-NEXT: - - 
- - - - - - - 0.50 0.50 1.00 fcvtau w29, h30 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtau xzr, h0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtns w3, s31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtns xzr, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtnu wzr, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtnu x0, s0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtps wzr, s9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtps x12, s20 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtpu w30, s23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtpu x29, s3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtms w2, s3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtms x4, s5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtmu w6, s7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtmu x8, s9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w10, s11 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w14, s15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x15, s16 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s17, w18 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s19, x20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf s21, w22 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf s23, x24 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtas w25, s26 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtas x27, s28 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtau w29, s30 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtau xzr, s0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtns w3, d31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtns xzr, d12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtnu wzr, d12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtnu x0, d0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtps wzr, d9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtps 
x12, d20 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtpu w30, d23 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtpu x29, d3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtms w2, d3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtms x4, d5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtmu w6, d7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtmu x8, d9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs w10, d11 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzs x12, d13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu w14, d15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtzu x15, d16 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d17, w18 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - scvtf d19, x20 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d21, w22 +# CHECK-NEXT: - - - - 1.00 - - - - 1.00 1.00 - ucvtf d23, x24 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtas w25, d26 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtas x27, d28 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtau w29, d30 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 1.00 fcvtau xzr, d0 +# CHECK-NEXT: - - - - 1.00 - - - - - - - fmov w3, s9 +# CHECK-NEXT: - - - - - - - - - - - 1.00 fmov s9, w3 +# CHECK-NEXT: - - - - 1.00 - - - - - - - fmov x20, d31 +# CHECK-NEXT: - - - - - - - - - - - 1.00 fmov d1, x15 +# CHECK-NEXT: - - - - 1.00 - - - - 0.50 0.50 - fmov x3, v12.d[1] +# CHECK-NEXT: - - - - - - - - - 1.00 - - fmov v1.d[1], x19 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov s2, #0.12500000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov s3, #1.00000000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov d30, #16.00000000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov s4, #1.06250000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov d10, #1.93750000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov s12, #-1.00000000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov d16, #8.50000000 +# CHECK-NEXT: - - - - - 0.50 0.50 - - 
- - - ldr w3, #0 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x29, #4 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsw xzr, #-4 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr s0, #8 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr d0, #1048572 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q0, #-1048576 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - prfm pldl1strm, #0 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - prfm #22, #0 +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stxrb w18, w8, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stxrh w24, w15, [x16] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stxr w5, w6, [x17] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stxr w1, x10, [x21] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxrb w30, [x0] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxrh w17, [x4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxr w22, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxr x11, [x29] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxr x11, [x29] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxr x11, [x29] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stxp w12, w11, w10, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stxp wzr, x27, x9, [x12] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxp w0, wzr, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxp x17, x0, [x18] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldxp x17, x0, [x18] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stlxrb w12, w22, [x0] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stlxrh w10, w1, [x1] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stlxr w9, w2, [x2] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stlxr w9, x3, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxrb w8, [x4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxrh w7, [x5] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxr w6, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxr x5, [x6] +# CHECK-NEXT: - - - - - 0.50 0.50 - - 
- - - ldaxr x5, [x6] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxr x5, [x6] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stlxp w4, w5, w6, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 0.50 0.50 - - - stlxp wzr, x6, x7, [x1] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxp w5, w18, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxp x6, x19, [x22] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldaxp x6, x19, [x22] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stlrb w24, [sp] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stlrh w25, [x30] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stlr w26, [x29] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stlr x27, [x28] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stlr x27, [x28] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stlr x27, [x28] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldarb w23, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldarh w22, [x30] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldar wzr, [x29] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldar x21, [x28] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldar x21, [x28] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldar x21, [x28] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - sturb w9, [sp] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - sturh wzr, [x12, #255] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stur w16, [x0, #-256] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stur x28, [x14, #1] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldurb w1, [x20, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldurh w20, [x1, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldur w12, [sp, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldur xzr, [x12, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldursb x9, [x7, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldursh x17, [x19, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldursw x20, [x15, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - prfum pldl2keep, [sp, #-256] +# CHECK-NEXT: - - - - - 0.50 
0.50 - - - - - ldursb w19, [x1, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldursh w15, [x21, #-256] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 stur b0, [sp, #1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 stur h12, [x12, #-1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 stur s15, [x0, #255] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 stur d31, [x5, #25] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 stur q9, [x5] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldur b3, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldur h5, [x4, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldur s7, [x12, #-1] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldur d11, [x19, #4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldur q13, [x1, #2] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strb w9, [x2], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strb w10, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strb w10, [x3], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strh w9, [x2], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strh w9, [x2], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strh w10, [x3], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str w19, [sp], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str w20, [x30], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str w21, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str xzr, [x9], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str x2, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str x19, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrb w9, [x2], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrb w10, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrb w10, [x3], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrh w9, [x2], #255 +# 
CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrh w9, [x2], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrh w10, [x3], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr w19, [sp], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr w20, [x30], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr w21, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr xzr, [x9], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr x2, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr x19, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb xzr, [x9], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb x2, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb x19, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh xzr, [x9], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh x2, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh x19, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsw xzr, [x9], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsw x2, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsw x19, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb wzr, [x9], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb w2, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb w19, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh wzr, [x9], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh w2, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh w19, [x12], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str b0, [x0], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str b3, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - 
str b5, [sp], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str h10, [x10], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str h13, [x23], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str h15, [sp], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str s20, [x20], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str s23, [x23], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str s25, [x0], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str d20, [x20], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str d23, [x23], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str d25, [x0], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr b0, [x0], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr b3, [x3], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr b5, [sp], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr h10, [x10], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr h13, [x23], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr h15, [sp], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr s20, [x20], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr s23, [x23], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr s25, [x0], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr d20, [x20], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr d23, [x23], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr d25, [x0], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr q20, [x1], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr q23, [x9], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr q25, [x20], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str q10, [x1], #255 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - 
- - str q22, [sp], #1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str q21, [x20], #-256 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr x3, [x4, #0]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strb w9, [x2, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strb w10, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strb w10, [x3, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strh w9, [x2, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strh w9, [x2, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - strh w10, [x3, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str w19, [sp, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str w20, [x30, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str w21, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str xzr, [x9, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str x2, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str x19, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrb w9, [x2, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrb w10, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrb w10, [x3, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrh w9, [x2, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrh w9, [x2, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrh w10, [x3, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr w19, [sp, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr w20, [x30, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr w21, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr xzr, [x9, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr x2, [x3, #1]! 
+# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr x19, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb xzr, [x9, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb x2, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb x19, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh xzr, [x9, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh x2, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh x19, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsw xzr, [x9, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsw x2, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsw x19, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb wzr, [x9, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb w2, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsb w19, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh wzr, [x9, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh w2, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldrsh w19, [x12, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str b0, [x0, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str b3, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str b5, [sp, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str h10, [x10, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str h13, [x23, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str h15, [sp, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str s20, [x20, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str s23, [x23, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str s25, [x0, #-256]! 
+# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str d20, [x20, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str d23, [x23, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str d25, [x0, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr b0, [x0, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr b3, [x3, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr b5, [sp, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr h10, [x10, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr h13, [x23, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr h15, [sp, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr s20, [x20, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr s23, [x23, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr s25, [x0, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr d20, [x20, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr d23, [x23, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr d25, [x0, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr q20, [x1, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr q23, [x9, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - 0.50 0.50 - - - - - ldr q25, [x20, #-256]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str q10, [x1, #255]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str q22, [sp, #1]! +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - str q21, [x20, #-256]! 
+# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - sttrb w9, [sp] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - sttrh wzr, [x12, #255] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - sttr w16, [x0, #-256] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - sttr x28, [x14, #1] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtrb w1, [x20, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtrh w20, [x1, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtr w12, [sp, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtr xzr, [x12, #255] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtrsb x9, [x7, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtrsh x17, [x19, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtrsw x20, [x15, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtrsb w19, [x1, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldtrsh w15, [x21, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x4, [x29] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x30, [x12, #32760] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x20, [sp, #8] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr xzr, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w2, [sp] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w17, [sp, #16380] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w13, [x2, #4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsw x2, [x5, #4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsw x23, [sp, #16380] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrh w2, [x4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsh w23, [x6, #8190] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsh wzr, [sp, #2] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsh x29, [x2, #2] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrb w26, [x3, #121] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrb w12, [x2] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsb w27, [sp, #4095] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsb xzr, [x15] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 
- str x30, [sp] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - str w20, [x4, #16380] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - strh w17, [sp, #8190] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - strb w23, [x3, #4095] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - strb wzr, [x2] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr b31, [sp, #4095] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr h20, [x2, #8190] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr s10, [x19, #16380] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr d3, [x10, #32760] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 str q12, [sp, #65520] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrb w3, [sp, x5] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrb w9, [x27, x6] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrsb w10, [x30, x7] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrb w11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - strb w12, [x28, xzr, sxtx] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrb w14, [x26, w6, uxtw] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrsb w15, [x25, w7, uxtw] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrb w17, [x23, w9, sxtw] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrsb x18, [x22, w10, sxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsh w3, [sp, x5] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsh w9, [x27, x6] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrh w10, [x30, x7, lsl #1] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - strh w11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrh w12, [x28, xzr, sxtx] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsh x13, [x27, x5, sxtx #1] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrh w14, [x26, w6, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrh w15, [x25, w7, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrsh w16, [x24, w8, uxtw #1] +# CHECK-NEXT: - - - - - 0.50 
0.50 - - - - - ldrh w17, [x23, w9, sxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldrh w18, [x22, w10, sxtw] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - strh w19, [x21, wzr, sxtw #1] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w3, [sp, x5] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr s9, [x27, x6] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w10, [x30, x7, lsl #2] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 str s12, [x28, xzr, sxtx] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - str w13, [x27, x5, sxtx #2] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - str w14, [x26, w6, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w15, [x25, w7, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w16, [x24, w8, uxtw #2] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrsw x17, [x23, w9, sxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr w18, [x22, w10, sxtw] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - 0.50 0.50 - - - ldrsw x19, [x21, wzr, sxtw #2] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x3, [sp, x5] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - str x9, [x27, x6] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr d10, [x30, x7, lsl #3] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - str x11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x12, [x28, xzr, sxtx] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x13, [x27, x5, sxtx #3] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - prfm pldl1keep, [x26, w6, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x15, [x25, w7, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x16, [x24, w8, uxtw #3] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x17, [x23, w9, sxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr x18, [x22, w10, sxtw] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 str d19, [x21, wzr, sxtw #3] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q3, [sp, x5] +# CHECK-NEXT: - - - - - 0.50 
0.50 - - - - - ldr q9, [x27, x6] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q10, [x30, x7, lsl #4] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 str q11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 str q12, [x28, xzr, sxtx] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 str q13, [x27, x5, sxtx #4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q14, [x26, w6, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q15, [x25, w7, uxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q16, [x24, w8, uxtw #4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q17, [x23, w9, sxtw] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 str q18, [x22, w10, sxtw] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldr q19, [x21, wzr, sxtw #4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w3, w5, [sp] +# CHECK-NEXT: - - 0.50 0.50 - - - 0.50 0.50 - - - stp wzr, w9, [sp, #252] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w2, wzr, [sp, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w9, w10, [sp, #4] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x9, x10, [sp, #4] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x9, x10, [x2, #-256] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x20, x30, [sp, #252] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x21, x29, [x2, #504] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x22, x23, [x3, #-512] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x24, x25, [x4, #8] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp s29, s28, [sp, #252] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp s27, s26, [sp, #-256] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp s1, s2, [x3, #44] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp d3, d5, [x9, #504] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp d7, d11, [x10, #-512] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp d2, d3, [x30, #-8] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp q3, q5, [sp] +# CHECK-NEXT: - - - - - - - 1.00 
1.00 - - 2.00 stp q17, q19, [sp, #1008] +# CHECK-NEXT: - - - - - 1.00 1.00 - - - - - ldp q23, q29, [x1, #-1024] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w3, w5, [sp], #0 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - 0.50 0.50 - - - stp wzr, w9, [sp], #252 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w2, wzr, [sp], #-256 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w9, w10, [sp], #4 +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x9, x10, [sp], #4 +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x9, x10, [x2], #-256 +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x20, x30, [sp], #252 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x21, x29, [x2], #504 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x22, x23, [x3], #-512 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x24, x25, [x4], #8 +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp s29, s28, [sp], #252 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp s27, s26, [sp], #-256 +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp s1, s2, [x3], #44 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp d3, d5, [x9], #504 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp d7, d11, [x10], #-512 +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp d2, d3, [x30], #-8 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp q3, q5, [sp], #0 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp q17, q19, [sp], #1008 +# CHECK-NEXT: - - - - - 1.00 1.00 - - - - - ldp q23, q29, [x1], #-1024 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w3, w5, [sp, #0]! +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - 0.50 0.50 - - - stp wzr, w9, [sp, #252]! +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w2, wzr, [sp, #-256]! +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp w9, w10, [sp, #4]! +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x9, x10, [sp, #4]! +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x9, x10, [x2, #-256]! +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldpsw x20, x30, [sp, #252]! 
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x21, x29, [x2, #504]! +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x22, x23, [x3, #-512]! +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldp x24, x25, [x4, #8]! +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp s29, s28, [sp, #252]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp s27, s26, [sp, #-256]! +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp s1, s2, [x3, #44]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp d3, d5, [x9, #504]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp d7, d11, [x10, #-512]! +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldp d2, d3, [x30, #-8]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp q3, q5, [sp, #0]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stp q17, q19, [sp, #1008]! +# CHECK-NEXT: - - - - - 1.00 1.00 - - - - - ldp q23, q29, [x1, #-1024]! +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldnp w3, w5, [sp] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stnp wzr, w9, [sp, #252] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldnp w2, wzr, [sp, #-256] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldnp w9, w10, [sp, #4] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldnp x21, x29, [x2, #504] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldnp x22, x23, [x3, #-512] +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ldnp x24, x25, [x4, #8] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldnp s29, s28, [sp, #252] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stnp s27, s26, [sp, #-256] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldnp s1, s2, [x3, #44] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stnp d3, d5, [x9, #504] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stnp d7, d11, [x10, #-512] +# CHECK-NEXT: - - - - 1.00 0.50 0.50 - - - - - ldnp d2, d3, [x30, #-8] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stnp q3, q5, [sp] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 stnp q17, q19, [sp, #1008] +# CHECK-NEXT: - - - - - 1.00 1.00 - - - - - ldnp q23, q29, [x1, 
#-1024] +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov w3, #983055 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov x10, #-6148914691236517206 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - and w12, w23, w21 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - and w16, w15, w1, lsl #1 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and w9, w4, w10, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - and w3, w30, w11 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and x3, x5, x7, lsl #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and x5, x14, x19, asr #4 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and w3, w17, w19, ror #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and w0, w2, wzr, lsr #17 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and w3, w30, w11, asr #2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - and xzr, x4, x26 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and w3, wzr, w20, ror #2 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - and x7, x20, xzr, asr #63 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - bic x13, x20, x14, lsl #47 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - bic w2, w7, w9 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - orr w2, w7, w0, asr #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - orr x8, x9, x10, lsl #12 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - orn x3, x5, x7, asr #2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - orn w2, w5, w29 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - ands w7, wzr, w9, lsl #1 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - ands x3, x5, x20, ror #63 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - bics w3, w5, w7 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - bics x3, xzr, x3, lsl #1 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - tst w3, w7, lsl #31 +# CHECK-NEXT: 0.25 0.25 0.75 0.75 - - - - - - - - tst x2, x20, asr #2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov x3, x6 +# CHECK-NEXT: 
0.25 0.25 0.25 0.25 - - - - - - - - mov x3, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov wzr, w2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov w3, w5 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movz w2, #0, lsl #16 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov w2, #-1235 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov x2, #5299989643264 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - mov x2, #0 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movk w3, #0 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movz x4, #0, lsl #16 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movk w5, #0, lsl #16 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movz x6, #0, lsl #32 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movk x7, #0, lsl #32 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movz x8, #0, lsl #48 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - movk x9, #0, lsl #48 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adr x2, #1600 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adrp x21, #6553600 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - adr x0, #262144 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - tbz x12, #62, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - tbz x12, #62, #4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - tbz x12, #62, #-32768 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - tbnz x12, #60, #32764 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - b #4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - b #-4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - b #134217724 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - br x20 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - blr xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ret x10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - ret +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - eret +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - drps diff --git a/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/cssc-instructions.s 
b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/cssc-instructions.s new file mode 100644 index 00000000000000..a19a106f4b47ec --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/cssc-instructions.s @@ -0,0 +1,76 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=ampere1b -instruction-tables < %s | FileCheck %s + +abs w1, w2 +abs x2, x3 +cnt w3, w4 +cnt x4, x5 +ctz w5, w6 +ctz x6, x7 +smax w7, w8, w9 +smax x8, x9, x10 +umax w9, w10, w11 +umax x10, x11, x12 +smin w11, w12, w13 +smin w12, w13, w14 +umin w13, w14, w15 +umin x14, x15, x16 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.25 abs w1, w2 +# CHECK-NEXT: 1 1 0.25 abs x2, x3 +# CHECK-NEXT: 1 3 1.00 cnt w3, w4 +# CHECK-NEXT: 1 3 1.00 cnt x4, x5 +# CHECK-NEXT: 1 1 0.50 ctz w5, w6 +# CHECK-NEXT: 1 1 0.50 ctz x6, x7 +# CHECK-NEXT: 2 1 0.50 smax w7, w8, w9 +# CHECK-NEXT: 2 1 0.50 smax x8, x9, x10 +# CHECK-NEXT: 2 1 0.50 umax w9, w10, w11 +# CHECK-NEXT: 2 1 0.50 umax x10, x11, x12 +# CHECK-NEXT: 2 1 0.50 smin w11, w12, w13 +# CHECK-NEXT: 2 1 0.50 smin w12, w13, w14 +# CHECK-NEXT: 2 1 0.50 umin w13, w14, w15 +# CHECK-NEXT: 2 1 0.50 umin x14, x15, x16 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - Ampere1BUnitA +# CHECK-NEXT: [0.1] - Ampere1BUnitA +# CHECK-NEXT: [1.0] - Ampere1BUnitB +# CHECK-NEXT: [1.1] - Ampere1BUnitB +# CHECK-NEXT: [2] - Ampere1BUnitBS +# CHECK-NEXT: [3.0] - Ampere1BUnitL +# CHECK-NEXT: [3.1] - Ampere1BUnitL +# CHECK-NEXT: [4.0] - Ampere1BUnitS +# CHECK-NEXT: [4.1] - Ampere1BUnitS +# CHECK-NEXT: [5] - Ampere1BUnitX +# CHECK-NEXT: [6] - Ampere1BUnitY +# CHECK-NEXT: [7] - Ampere1BUnitZ + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] 
[3.1] [4.0] [4.1] [5] [6] [7] +# CHECK-NEXT: 6.50 6.50 3.50 3.50 2.00 - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4.0] [4.1] [5] [6] [7] Instructions: +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - abs w1, w2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - abs x2, x3 +# CHECK-NEXT: - - - - 1.00 - - - - - - - cnt w3, w4 +# CHECK-NEXT: - - - - 1.00 - - - - - - - cnt x4, x5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ctz w5, w6 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - ctz x6, x7 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - smax w7, w8, w9 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - smax x8, x9, x10 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - umax w9, w10, w11 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - umax x10, x11, x12 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - smin w11, w12, w13 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - smin w12, w13, w14 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - umin w13, w14, w15 +# CHECK-NEXT: 0.75 0.75 0.25 0.25 - - - - - - - - umin x14, x15, x16 diff --git a/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/mte-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/mte-instructions.s new file mode 100644 index 00000000000000..5148522431edbf --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/mte-instructions.s @@ -0,0 +1,349 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=ampere1b -instruction-tables < %s | FileCheck %s + +irg x0, x1 +irg sp, x1 +irg x0, sp +irg x0, x1, x2 +irg sp, x1, x2 +addg x0, x1, #0, #1 +addg sp, x2, #32, #3 +addg x0, sp, #64, #5 +addg x3, x4, #1008, #6 +addg x5, x6, #112, #15 +subg x0, x1, #0, #1 +subg sp, x2, #32, #3 +subg x0, sp, #64, #5 +subg x3, x4, #1008, #6 +subg x5, x6, #112, #15 +gmi x0, x1, x2 +gmi x3, sp, x4 +gmi xzr, x0, x30 +gmi x30, x0, xzr +subp x0, x1, x2 +subps x0, x1, 
x2 +subp x0, sp, sp +subps x0, sp, sp +subps xzr, x0, x1 +subps xzr, sp, sp +stg x0, [x1, #-4096] +stg x1, [x2, #4080] +stg x2, [sp, #16] +stg x3, [x1] +stg sp, [x1] +stzg x0, [x1, #-4096] +stzg x1, [x2, #4080] +stzg x2, [sp, #16] +stzg x3, [x1] +stzg sp, [x1] +stg x0, [x1, #-4096]! +stg x1, [x2, #4080]! +stg x2, [sp, #16]! +stg sp, [sp, #16]! +stzg x0, [x1, #-4096]! +stzg x1, [x2, #4080]! +stzg x2, [sp, #16]! +stzg sp, [sp, #16]! +stg x0, [x1], #-4096 +stg x1, [x2], #4080 +stg x2, [sp], #16 +stg sp, [sp], #16 +stzg x0, [x1], #-4096 +stzg x1, [x2], #4080 +stzg x2, [sp], #16 +stzg sp, [sp], #16 +st2g x0, [x1, #-4096] +st2g x1, [x2, #4080] +st2g x2, [sp, #16] +st2g x3, [x1] +st2g sp, [x1] +stz2g x0, [x1, #-4096] +stz2g x1, [x2, #4080] +stz2g x2, [sp, #16] +stz2g x3, [x1] +stz2g sp, [x1] +st2g x0, [x1, #-4096]! +st2g x1, [x2, #4080]! +st2g x2, [sp, #16]! +st2g sp, [sp, #16]! +stz2g x0, [x1, #-4096]! +stz2g x1, [x2, #4080]! +stz2g x2, [sp, #16]! +stz2g sp, [sp, #16]! +st2g x0, [x1], #-4096 +st2g x1, [x2], #4080 +st2g x2, [sp], #16 +st2g sp, [sp], #16 +stz2g x0, [x1], #-4096 +stz2g x1, [x2], #4080 +stz2g x2, [sp], #16 +stz2g sp, [sp], #16 +stgp x0, x1, [x2, #-1024] +stgp x0, x1, [x2, #1008] +stgp x0, x1, [sp, #16] +stgp xzr, x1, [x2, #16] +stgp x0, xzr, [x2, #16] +stgp x0, xzr, [x2] +stgp x0, x1, [x2, #-1024]! +stgp x0, x1, [x2, #1008]! +stgp x0, x1, [sp, #16]! +stgp xzr, x1, [x2, #16]! +stgp x0, xzr, [x2, #16]! 
+stgp x0, x1, [x2], #-1024 +stgp x0, x1, [x2], #1008 +stgp x0, x1, [sp], #16 +stgp xzr, x1, [x2], #16 +stgp x0, xzr, [x2], #16 +ldg x0, [x1] +ldg x2, [sp, #-4096] +ldg x3, [x4, #4080] +ldgm x0, [x1] +ldgm x1, [sp] +ldgm xzr, [x2] +stgm x0, [x1] +stgm x1, [sp] +stgm xzr, [x2] +stzgm x0, [x1] +stzgm x1, [sp] +stzgm xzr, [x2] + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 2 1 1.00 U irg x0, x1 +# CHECK-NEXT: 2 1 1.00 U irg sp, x1 +# CHECK-NEXT: 2 1 1.00 U irg x0, sp +# CHECK-NEXT: 2 1 1.00 U irg x0, x1, x2 +# CHECK-NEXT: 2 1 1.00 U irg sp, x1, x2 +# CHECK-NEXT: 1 1 0.50 addg x0, x1, #0, #1 +# CHECK-NEXT: 1 1 0.50 addg sp, x2, #32, #3 +# CHECK-NEXT: 1 1 0.50 addg x0, sp, #64, #5 +# CHECK-NEXT: 1 1 0.50 addg x3, x4, #1008, #6 +# CHECK-NEXT: 1 1 0.50 addg x5, x6, #112, #15 +# CHECK-NEXT: 1 1 0.50 U subg x0, x1, #0, #1 +# CHECK-NEXT: 1 1 0.50 U subg sp, x2, #32, #3 +# CHECK-NEXT: 1 1 0.50 U subg x0, sp, #64, #5 +# CHECK-NEXT: 1 1 0.50 U subg x3, x4, #1008, #6 +# CHECK-NEXT: 1 1 0.50 U subg x5, x6, #112, #15 +# CHECK-NEXT: 1 1 0.25 gmi x0, x1, x2 +# CHECK-NEXT: 1 1 0.25 gmi x3, sp, x4 +# CHECK-NEXT: 1 1 0.25 gmi xzr, x0, x30 +# CHECK-NEXT: 1 1 0.25 gmi x30, x0, xzr +# CHECK-NEXT: 1 1 0.25 subp x0, x1, x2 +# CHECK-NEXT: 1 1 0.25 U subps x0, x1, x2 +# CHECK-NEXT: 1 1 0.25 subp x0, sp, sp +# CHECK-NEXT: 1 1 0.25 U subps x0, sp, sp +# CHECK-NEXT: 1 1 0.25 U subps xzr, x0, x1 +# CHECK-NEXT: 1 1 0.25 U subps xzr, sp, sp +# CHECK-NEXT: 1 1 0.50 * stg x0, [x1, #-4096] +# CHECK-NEXT: 1 1 0.50 * stg x1, [x2, #4080] +# CHECK-NEXT: 1 1 0.50 * stg x2, [sp, #16] +# CHECK-NEXT: 1 1 0.50 * stg x3, [x1] +# CHECK-NEXT: 1 1 0.50 * stg sp, [x1] +# CHECK-NEXT: 1 1 0.50 * stzg x0, [x1, #-4096] +# CHECK-NEXT: 1 1 0.50 * stzg x1, [x2, #4080] +# CHECK-NEXT: 1 1 0.50 * 
stzg x2, [sp, #16] +# CHECK-NEXT: 1 1 0.50 * stzg x3, [x1] +# CHECK-NEXT: 1 1 0.50 * stzg sp, [x1] +# CHECK-NEXT: 1 1 0.50 * U stg x0, [x1, #-4096]! +# CHECK-NEXT: 1 1 0.50 * U stg x1, [x2, #4080]! +# CHECK-NEXT: 1 1 0.50 * U stg x2, [sp, #16]! +# CHECK-NEXT: 1 1 0.50 * U stg sp, [sp, #16]! +# CHECK-NEXT: 1 1 0.50 * U stzg x0, [x1, #-4096]! +# CHECK-NEXT: 1 1 0.50 * U stzg x1, [x2, #4080]! +# CHECK-NEXT: 1 1 0.50 * U stzg x2, [sp, #16]! +# CHECK-NEXT: 1 1 0.50 * U stzg sp, [sp, #16]! +# CHECK-NEXT: 1 1 0.50 * U stg x0, [x1], #-4096 +# CHECK-NEXT: 1 1 0.50 * U stg x1, [x2], #4080 +# CHECK-NEXT: 1 1 0.50 * U stg x2, [sp], #16 +# CHECK-NEXT: 1 1 0.50 * U stg sp, [sp], #16 +# CHECK-NEXT: 1 1 0.50 * U stzg x0, [x1], #-4096 +# CHECK-NEXT: 1 1 0.50 * U stzg x1, [x2], #4080 +# CHECK-NEXT: 1 1 0.50 * U stzg x2, [sp], #16 +# CHECK-NEXT: 1 1 0.50 * U stzg sp, [sp], #16 +# CHECK-NEXT: 2 1 1.00 * st2g x0, [x1, #-4096] +# CHECK-NEXT: 2 1 1.00 * st2g x1, [x2, #4080] +# CHECK-NEXT: 2 1 1.00 * st2g x2, [sp, #16] +# CHECK-NEXT: 2 1 1.00 * st2g x3, [x1] +# CHECK-NEXT: 2 1 1.00 * st2g sp, [x1] +# CHECK-NEXT: 2 1 1.00 * stz2g x0, [x1, #-4096] +# CHECK-NEXT: 2 1 1.00 * stz2g x1, [x2, #4080] +# CHECK-NEXT: 2 1 1.00 * stz2g x2, [sp, #16] +# CHECK-NEXT: 2 1 1.00 * stz2g x3, [x1] +# CHECK-NEXT: 2 1 1.00 * stz2g sp, [x1] +# CHECK-NEXT: 2 1 1.00 * U st2g x0, [x1, #-4096]! +# CHECK-NEXT: 2 1 1.00 * U st2g x1, [x2, #4080]! +# CHECK-NEXT: 2 1 1.00 * U st2g x2, [sp, #16]! +# CHECK-NEXT: 2 1 1.00 * U st2g sp, [sp, #16]! +# CHECK-NEXT: 2 1 1.00 * U stz2g x0, [x1, #-4096]! +# CHECK-NEXT: 2 1 1.00 * U stz2g x1, [x2, #4080]! +# CHECK-NEXT: 2 1 1.00 * U stz2g x2, [sp, #16]! +# CHECK-NEXT: 2 1 1.00 * U stz2g sp, [sp, #16]! 
+# CHECK-NEXT: 2 1 1.00 * U st2g x0, [x1], #-4096 +# CHECK-NEXT: 2 1 1.00 * U st2g x1, [x2], #4080 +# CHECK-NEXT: 2 1 1.00 * U st2g x2, [sp], #16 +# CHECK-NEXT: 2 1 1.00 * U st2g sp, [sp], #16 +# CHECK-NEXT: 2 1 1.00 * U stz2g x0, [x1], #-4096 +# CHECK-NEXT: 2 1 1.00 * U stz2g x1, [x2], #4080 +# CHECK-NEXT: 2 1 1.00 * U stz2g x2, [sp], #16 +# CHECK-NEXT: 2 1 1.00 * U stz2g sp, [sp], #16 +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [x2, #-1024] +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [x2, #1008] +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [sp, #16] +# CHECK-NEXT: 2 1 1.00 * stgp xzr, x1, [x2, #16] +# CHECK-NEXT: 2 1 1.00 * stgp x0, xzr, [x2, #16] +# CHECK-NEXT: 2 1 1.00 * stgp x0, xzr, [x2] +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [x2, #-1024]! +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [x2, #1008]! +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [sp, #16]! +# CHECK-NEXT: 2 1 1.00 * stgp xzr, x1, [x2, #16]! +# CHECK-NEXT: 2 1 1.00 * stgp x0, xzr, [x2, #16]! +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [x2], #-1024 +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [x2], #1008 +# CHECK-NEXT: 2 1 1.00 * stgp x0, x1, [sp], #16 +# CHECK-NEXT: 2 1 1.00 * stgp xzr, x1, [x2], #16 +# CHECK-NEXT: 2 1 1.00 * stgp x0, xzr, [x2], #16 +# CHECK-NEXT: 2 4 0.50 * ldg x0, [x1] +# CHECK-NEXT: 2 4 0.50 * ldg x2, [sp, #-4096] +# CHECK-NEXT: 2 4 0.50 * ldg x3, [x4, #4080] +# CHECK-NEXT: 2 4 0.50 * U ldgm x0, [x1] +# CHECK-NEXT: 2 4 0.50 * U ldgm x1, [sp] +# CHECK-NEXT: 2 4 0.50 * U ldgm xzr, [x2] +# CHECK-NEXT: 1 1 0.50 U stgm x0, [x1] +# CHECK-NEXT: 1 1 0.50 U stgm x1, [sp] +# CHECK-NEXT: 1 1 0.50 U stgm xzr, [x2] +# CHECK-NEXT: 1 1 0.50 U stzgm x0, [x1] +# CHECK-NEXT: 1 1 0.50 U stzgm x1, [sp] +# CHECK-NEXT: 1 1 0.50 U stzgm xzr, [x2] + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - Ampere1BUnitA +# CHECK-NEXT: [0.1] - Ampere1BUnitA +# CHECK-NEXT: [1.0] - Ampere1BUnitB +# CHECK-NEXT: [1.1] - Ampere1BUnitB +# CHECK-NEXT: [2] - Ampere1BUnitBS +# CHECK-NEXT: [3.0] - Ampere1BUnitL +# CHECK-NEXT: [3.1] - Ampere1BUnitL +# CHECK-NEXT: 
[4.0] - Ampere1BUnitS +# CHECK-NEXT: [4.1] - Ampere1BUnitS +# CHECK-NEXT: [5] - Ampere1BUnitX +# CHECK-NEXT: [6] - Ampere1BUnitY +# CHECK-NEXT: [7] - Ampere1BUnitZ + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4.0] [4.1] [5] [6] [7] +# CHECK-NEXT: 2.50 2.50 13.00 13.00 5.00 3.00 3.00 58.00 58.00 - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4.0] [4.1] [5] [6] [7] Instructions: +# CHECK-NEXT: - - 0.50 0.50 1.00 - - - - - - - irg x0, x1 +# CHECK-NEXT: - - 0.50 0.50 1.00 - - - - - - - irg sp, x1 +# CHECK-NEXT: - - 0.50 0.50 1.00 - - - - - - - irg x0, sp +# CHECK-NEXT: - - 0.50 0.50 1.00 - - - - - - - irg x0, x1, x2 +# CHECK-NEXT: - - 0.50 0.50 1.00 - - - - - - - irg sp, x1, x2 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - addg x0, x1, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - addg sp, x2, #32, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - addg x0, sp, #64, #5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - addg x3, x4, #1008, #6 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - addg x5, x6, #112, #15 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - subg x0, x1, #0, #1 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - subg sp, x2, #32, #3 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - subg x0, sp, #64, #5 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - subg x3, x4, #1008, #6 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - subg x5, x6, #112, #15 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - gmi x0, x1, x2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - gmi x3, sp, x4 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - gmi xzr, x0, x30 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - gmi x30, x0, xzr +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subp x0, x1, x2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subps x0, x1, x2 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subp x0, sp, sp +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subps x0, 
sp, sp +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subps xzr, x0, x1 +# CHECK-NEXT: 0.25 0.25 0.25 0.25 - - - - - - - - subps xzr, sp, sp +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x0, [x1, #-4096] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x1, [x2, #4080] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x2, [sp, #16] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x3, [x1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg sp, [x1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x0, [x1, #-4096] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x1, [x2, #4080] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x2, [sp, #16] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x3, [x1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg sp, [x1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x0, [x1, #-4096]! +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x1, [x2, #4080]! +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x2, [sp, #16]! +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg sp, [sp, #16]! +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x0, [x1, #-4096]! +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x1, [x2, #4080]! +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x2, [sp, #16]! +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg sp, [sp, #16]! 
+# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x0, [x1], #-4096 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x1, [x2], #4080 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg x2, [sp], #16 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stg sp, [sp], #16 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x0, [x1], #-4096 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x1, [x2], #4080 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg x2, [sp], #16 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzg sp, [sp], #16 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x0, [x1, #-4096] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x1, [x2, #4080] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x2, [sp, #16] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x3, [x1] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g sp, [x1] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x0, [x1, #-4096] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x1, [x2, #4080] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x2, [sp, #16] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x3, [x1] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g sp, [x1] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x0, [x1, #-4096]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x1, [x2, #4080]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x2, [sp, #16]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g sp, [sp, #16]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x0, [x1, #-4096]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x1, [x2, #4080]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x2, [sp, #16]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g sp, [sp, #16]! 
+# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x0, [x1], #-4096 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x1, [x2], #4080 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g x2, [sp], #16 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - st2g sp, [sp], #16 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x0, [x1], #-4096 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x1, [x2], #4080 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g x2, [sp], #16 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stz2g sp, [sp], #16 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [x2, #-1024] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [x2, #1008] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [sp, #16] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp xzr, x1, [x2, #16] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, xzr, [x2, #16] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, xzr, [x2] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [x2, #-1024]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [x2, #1008]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [sp, #16]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp xzr, x1, [x2, #16]! +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, xzr, [x2, #16]! 
+# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [x2], #-1024 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [x2], #1008 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, x1, [sp], #16 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp xzr, x1, [x2], #16 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - - stgp x0, xzr, [x2], #16 +# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - ldg x0, [x1] +# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - ldg x2, [sp, #-4096] +# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - ldg x3, [x4, #4080] +# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - ldgm x0, [x1] +# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - ldgm x1, [sp] +# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - ldgm xzr, [x2] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stgm x0, [x1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stgm x1, [sp] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stgm xzr, [x2] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzgm x0, [x1] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzgm x1, [sp] +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - stzgm xzr, [x2] diff --git a/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/neon-instructions.s new file mode 100644 index 00000000000000..827c13a24763de --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/neon-instructions.s @@ -0,0 +1,3235 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=ampere1b -instruction-tables < %s | FileCheck %s + +abs d29, d24 +abs v0.16b, v0.16b +abs v0.2d, v0.2d +abs v0.2s, v0.2s +abs v0.4h, v0.4h +abs v0.4s, v0.4s +abs v0.8b, v0.8b +abs v0.8h, v0.8h +add d17, d31, d29 +add v0.8b, v0.8b, v0.8b +addhn v0.2s, v0.2d, v0.2d +addhn v0.4h, v0.4s, v0.4s +addhn v0.8b, v0.8h, v0.8h +addhn2 v0.16b, v0.8h, v0.8h +addhn2 v0.4s, v0.2d, v0.2d +addhn2 v0.8h, v0.4s, v0.4s +addp v0.2d, v0.2d, v0.2d +addp v0.8b, v0.8b, 
v0.8b +and v0.8b, v0.8b, v0.8b +bic v0.4h, #15, lsl #8 +bic v0.8b, v0.8b, v0.8b +bif v0.16b, v0.16b, v0.16b +bit v0.16b, v0.16b, v0.16b +bsl v0.8b, v0.8b, v0.8b +cls v0.16b, v0.16b +cls v0.2s, v0.2s +cls v0.4h, v0.4h +cls v0.4s, v0.4s +cls v0.8b, v0.8b +cls v0.8h, v0.8h +clz v0.16b, v0.16b +clz v0.2s, v0.2s +clz v0.4h, v0.4h +clz v0.4s, v0.4s +clz v0.8b, v0.8b +clz v0.8h, v0.8h +cmeq d20, d21, 0 +cmeq d20, d21, d22 +cmeq v0.16b, v0.16b, 0 +cmeq v0.16b, v0.16b, v0.16b +cmge d20, d21, 0 +cmge d20, d21, d22 +cmge v0.4h, v0.4h, v0.4h +cmge v0.8b, v0.8b, 0 +cmgt d20, d21, 0 +cmgt d20, d21, d22 +cmgt v0.2s, v0.2s, 0 +cmgt v0.4s, v0.4s, v0.4s +cmhi d20, d21, d22 +cmhi v0.8h, v0.8h, v0.8h +cmhs d20, d21, d22 +cmhs v0.8b, v0.8b, v0.8b +cmle d20, d21, 0 +cmle v0.2d, v0.2d, 0 +cmlt d20, d21, 0 +cmlt v0.8h, v0.8h, 0 +cmtst d20, d21, d22 +cmtst v0.2s, v0.2s, v0.2s +cnt v0.16b, v0.16b +cnt v0.8b, v0.8b +dup v0.16b,w28 +dup v0.2d,x28 +dup v0.2s,w28 +dup v0.4h,w28 +dup v0.4s,w28 +dup v0.8b,w28 +dup v0.8h,w28 +eor v0.16b, v0.16b, v0.16b +ext v0.16b, v0.16b, v0.16b, #3 +ext v0.8b, v0.8b, v0.8b, #3 +fabd d29, d24, d20 +fabd s29, s24, s20 +fabd v0.4s, v0.4s, v0.4s +fabs v0.2d, v0.2d +fabs v0.2s, v0.2s +fabs v0.4h, v0.4h +fabs v0.4s, v0.4s +fabs v0.8h, v0.8h +facge d20, d21, d22 +facge s10, s11, s12 +facge v0.4s, v0.4s, v0.4s +facgt d20, d21, d22 +facgt s10, s11, s12 +facgt v0.2d, v0.2d, v0.2d +fadd v0.4s, v0.4s, v0.4s +faddp v0.2s, v0.2s, v0.2s +faddp v0.4s, v0.4s, v0.4s +fcmeq d20, d21, #0.0 +fcmeq d20, d21, d22 +fcmeq s10, s11, #0.0 +fcmeq s10, s11, s12 +fcmeq v0.2s, v0.2s, #0.0 +fcmeq v0.2s, v0.2s, v0.2s +fcmge d20, d21, #0.0 +fcmge d20, d21, d22 +fcmge s10, s11, #0.0 +fcmge s10, s11, s12 +fcmge v0.2d, v0.2d, #0.0 +fcmge v0.4s, v0.4s, v0.4s +fcmgt d20, d21, #0.0 +fcmgt d20, d21, d22 +fcmgt s10, s11, #0.0 +fcmgt s10, s11, s12 +fcmgt v0.4s, v0.4s, #0.0 +fcmgt v0.4s, v0.4s, v0.4s +fcmle d20, d21, #0.0 +fcmle s10, s11, #0.0 +fcmle v0.2d, v0.2d, #0.0 +fcmlt d20, d21, #0.0 +fcmlt s10, 
s11, #0.0 +fcmlt v0.4s, v0.4s, #0.0 +fcvtas d21, d14 +fcvtas s12, s13 +fcvtas v0.2d, v0.2d +fcvtas v0.2s, v0.2s +fcvtas v0.4h, v0.4h +fcvtas v0.4s, v0.4s +fcvtas v0.8h, v0.8h +fcvtau d21, d14 +fcvtau s12, s13 +fcvtau v0.2d, v0.2d +fcvtau v0.2s, v0.2s +fcvtau v0.4h, v0.4h +fcvtau v0.4s, v0.4s +fcvtau v0.8h, v0.8h +fcvtl v0.2d, v0.2s +fcvtl v0.4s, v0.4h +fcvtl2 v0.2d, v0.4s +fcvtl2 v0.4s, v0.8h +fcvtms d21, d14 +fcvtms s22, s13 +fcvtms v0.2d, v0.2d +fcvtms v0.2s, v0.2s +fcvtms v0.4h, v0.4h +fcvtms v0.4s, v0.4s +fcvtms v0.8h, v0.8h +fcvtmu d21, d14 +fcvtmu s12, s13 +fcvtmu v0.2d, v0.2d +fcvtmu v0.2s, v0.2s +fcvtmu v0.4h, v0.4h +fcvtmu v0.4s, v0.4s +fcvtmu v0.8h, v0.8h +fcvtn v0.2s, v0.2d +fcvtn v0.4h, v0.4s +fcvtn2 v0.4s, v0.2d +fcvtn2 v0.8h, v0.4s +fcvtns d21, d14 +fcvtns s22, s13 +fcvtns v0.2d, v0.2d +fcvtns v0.2s, v0.2s +fcvtns v0.4h, v0.4h +fcvtns v0.4s, v0.4s +fcvtns v0.8h, v0.8h +fcvtnu d21, d14 +fcvtnu s12, s13 +fcvtnu v0.2d, v0.2d +fcvtnu v0.2s, v0.2s +fcvtnu v0.4h, v0.4h +fcvtnu v0.4s, v0.4s +fcvtnu v0.8h, v0.8h +fcvtps d21, d14 +fcvtps s22, s13 +fcvtps v0.2d, v0.2d +fcvtps v0.2s, v0.2s +fcvtps v0.4h, v0.4h +fcvtps v0.4s, v0.4s +fcvtps v0.8h, v0.8h +fcvtpu d21, d14 +fcvtpu s12, s13 +fcvtpu v0.2d, v0.2d +fcvtpu v0.2s, v0.2s +fcvtpu v0.4h, v0.4h +fcvtpu v0.4s, v0.4s +fcvtpu v0.8h, v0.8h +fcvtxn s22, d13 +fcvtxn v0.2s, v0.2d +fcvtxn2 v0.4s, v0.2d +fcvtzs d21, d12, #1 +fcvtzs d21, d14 +fcvtzs s12, s13 +fcvtzs s21, s12, #1 +fcvtzs v0.2d, v0.2d +fcvtzs v0.2d, v0.2d, #3 +fcvtzs v0.2s, v0.2s +fcvtzs v0.2s, v0.2s, #3 +fcvtzs v0.4h, v0.4h +fcvtzs v0.4s, v0.4s +fcvtzs v0.4s, v0.4s, #3 +fcvtzs v0.8h, v0.8h +fcvtzu d21, d12, #1 +fcvtzu d21, d14 +fcvtzu s12, s13 +fcvtzu s21, s12, #1 +fcvtzu v0.2d, v0.2d +fcvtzu v0.2d, v0.2d, #3 +fcvtzu v0.2s, v0.2s +fcvtzu v0.2s, v0.2s, #3 +fcvtzu v0.4h, v0.4h +fcvtzu v0.4s, v0.4s +fcvtzu v0.4s, v0.4s, #3 +fcvtzu v0.8h, v0.8h +fdiv v0.2s, v0.2s, v0.2s +fmax v0.2d, v0.2d, v0.2d +fmax v0.2s, v0.2s, v0.2s +fmax v0.4s, v0.4s, v0.4s +fmaxnm 
v0.2d, v0.2d, v0.2d +fmaxnm v0.2s, v0.2s, v0.2s +fmaxnm v0.4s, v0.4s, v0.4s +fmaxnmp v0.2d, v0.2d, v0.2d +fmaxnmp v0.2s, v0.2s, v0.2s +fmaxnmp v0.4s, v0.4s, v0.4s +fmaxp v0.2d, v0.2d, v0.2d +fmaxp v0.2s, v0.2s, v0.2s +fmaxp v0.4s, v0.4s, v0.4s +fmin v0.2d, v0.2d, v0.2d +fmin v0.2s, v0.2s, v0.2s +fmin v0.4s, v0.4s, v0.4s +fminnm v0.2d, v0.2d, v0.2d +fminnm v0.2s, v0.2s, v0.2s +fminnm v0.4s, v0.4s, v0.4s +fminnmp v0.2d, v0.2d, v0.2d +fminnmp v0.2s, v0.2s, v0.2s +fminnmp v0.4s, v0.4s, v0.4s +fminp v0.2d, v0.2d, v0.2d +fminp v0.2s, v0.2s, v0.2s +fminp v0.4s, v0.4s, v0.4s +fmla d0, d1, v0.d[1] +fmla s0, s1, v0.s[3] +fmla v0.2s, v0.2s, v0.2s +fmls d0, d4, v0.d[1] +fmls s3, s5, v0.s[3] +fmls v0.2s, v0.2s, v0.2s +fmov v0.2d, #-1.25 +fmov v0.2s, #13.0 +fmov v0.4s, #1.0 +fmul d0, d1, v0.d[1] +fmul s0, s1, v0.s[3] +fmul v0.2s, v0.2s, v0.2s +fmulx d0, d4, v0.d[1] +fmulx d23, d11, d1 +fmulx s20, s22, s15 +fmulx s3, s5, v0.s[3] +fmulx v0.2d, v0.2d, v0.2d +fmulx v0.2s, v0.2s, v0.2s +fmulx v0.4s, v0.4s, v0.4s +fneg v0.2d, v0.2d +fneg v0.2s, v0.2s +fneg v0.4h, v0.4h +fneg v0.4s, v0.4s +fneg v0.8h, v0.8h +frecpe d13, d13 +frecpe s19, s14 +frecpe v0.2d, v0.2d +frecpe v0.2s, v0.2s +frecpe v0.4h, v0.4h +frecpe v0.4s, v0.4s +frecpe v0.8h, v0.8h +frecps v0.4s, v0.4s, v0.4s +frecps d22, d30, d21 +frecps s21, s16, s13 +frecpx d16, d19 +frecpx s18, s10 +frinta v0.2d, v0.2d +frinta v0.2s, v0.2s +frinta v0.4h, v0.4h +frinta v0.4s, v0.4s +frinta v0.8h, v0.8h +frinti v0.2d, v0.2d +frinti v0.2s, v0.2s +frinti v0.4h, v0.4h +frinti v0.4s, v0.4s +frinti v0.8h, v0.8h +frintm v0.2d, v0.2d +frintm v0.2s, v0.2s +frintm v0.4h, v0.4h +frintm v0.4s, v0.4s +frintm v0.8h, v0.8h +frintn v0.2d, v0.2d +frintn v0.2s, v0.2s +frintn v0.4h, v0.4h +frintn v0.4s, v0.4s +frintn v0.8h, v0.8h +frintp v0.2d, v0.2d +frintp v0.2s, v0.2s +frintp v0.4h, v0.4h +frintp v0.4s, v0.4s +frintp v0.8h, v0.8h +frintx v0.2d, v0.2d +frintx v0.2s, v0.2s +frintx v0.4h, v0.4h +frintx v0.4s, v0.4s +frintx v0.8h, v0.8h +frintz v0.2d, v0.2d 
+frintz v0.2s, v0.2s +frintz v0.4h, v0.4h +frintz v0.4s, v0.4s +frintz v0.8h, v0.8h +frsqrte d21, d12 +frsqrte s22, s13 +frsqrte v0.2d, v0.2d +frsqrte v0.2s, v0.2s +frsqrte v0.4h, v0.4h +frsqrte v0.4s, v0.4s +frsqrte v0.8h, v0.8h +frsqrts d8, d22, d18 +frsqrts s21, s5, s12 +frsqrts v0.2d, v0.2d, v0.2d +fsqrt v0.2d, v0.2d +fsqrt v0.2s, v0.2s +fsqrt v0.4h, v0.4h +fsqrt v0.4s, v0.4s +fsqrt v0.8h, v0.8h +fsub v0.2s, v0.2s, v0.2s +ld1 { v0.16b }, [x0] +ld1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +ld1 { v0.4s, v1.4s }, [sp], #32 +ld1 { v0.4s, v1.4s, v2.4s }, [sp] +ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +ld1 { v0.8h }, [x15], x2 +ld1 { v0.8h, v1.8h }, [x15] +ld1 { v0.b }[9], [x0] +ld1 { v0.b }[9], [x0], #1 +ld1r { v0.16b }, [x0] +ld1r { v0.16b }, [x0], #1 +ld1r { v0.8h }, [x15] +ld1r { v0.8h }, [x15], #2 +ld2 { v0.16b, v1.16b }, [x0], x1 +ld2 { v0.8b, v1.8b }, [x0] +ld2 { v0.h, v1.h }[7], [x15] +ld2 { v0.h, v1.h }[7], [x15], #4 +ld2r { v0.2d, v1.2d }, [x0] +ld2r { v0.2d, v1.2d }, [x0], #16 +ld2r { v0.4s, v1.4s }, [sp] +ld2r { v0.4s, v1.4s }, [sp], #8 +ld3 { v0.4h, v1.4h, v2.4h }, [x15] +ld3 { v0.8h, v1.8h, v2.8h }, [x15], x2 +ld3 { v0.s, v1.s, v2.s }[3], [sp] +ld3 { v0.s, v1.s, v2.s }[3], [sp], x3 +ld3r { v0.4h, v1.4h, v2.4h }, [x15] +ld3r { v0.4h, v1.4h, v2.4h }, [x15], #6 +ld3r { v0.8b, v1.8b, v2.8b }, [x0] +ld3r { v0.8b, v1.8b, v2.8b }, [x0], #3 +ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0] +ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0], #32 +ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0], x0 +ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp] +ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp], x7 +ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x30 +mla v0.8b, v0.8b, v0.8b +mls v0.4h, v0.4h, v0.4h +mov b0, v0.b[15] +mov d6, v0.d[1] +mov h2, v0.h[5] +mov s17, v0.s[2] +mov v2.b[0], v0.b[0] +mov v2.h[1], v0.h[1] +mov v2.s[2], v0.s[2] +mov 
v2.d[1], v0.d[1] +mov v0.b[0], w8 +mov v0.h[1], w8 +mov v0.s[2], w8 +mov v0.d[1], x8 +mov v0.16b, v0.16b +mov v0.8b, v0.8b +movi d15, #0xff00ff00ff00ff +movi v0.16b, #31 +movi v0.2d, #0xff0000ff0000ffff +movi v0.2s, #8, msl #8 +movi v0.4s, #255, lsl #24 +movi v0.8b, #255 +mul v0.8b, v0.8b, v0.8b +mvni v0.2s, 0 +mvni v0.4s, #16, msl #16 +neg d29, d24 +neg v0.16b, v0.16b +neg v0.2d, v0.2d +neg v0.2s, v0.2s +neg v0.4h, v0.4h +neg v0.4s, v0.4s +neg v0.8b, v0.8b +neg v0.8h, v0.8h +not v0.16b, v0.16b +not v0.8b, v0.8b +orn v0.16b, v0.16b, v0.16b +orr v0.16b, v0.16b, v0.16b +orr v0.8h, #31 +pmul v0.16b, v0.16b, v0.16b +pmul v0.8b, v0.8b, v0.8b +pmull v0.8h, v0.8b, v0.8b +pmull2 v0.8h, v0.16b, v0.16b +raddhn v0.2s, v0.2d, v0.2d +raddhn v0.4h, v0.4s, v0.4s +raddhn v0.8b, v0.8h, v0.8h +raddhn2 v0.16b, v0.8h, v0.8h +raddhn2 v0.4s, v0.2d, v0.2d +raddhn2 v0.8h, v0.4s, v0.4s +rbit v0.16b, v0.16b +rbit v0.8b, v0.8b +rev16 v21.8b, v1.8b +rev16 v30.16b, v31.16b +rev32 v0.4h, v9.4h +rev32 v21.8b, v1.8b +rev32 v30.16b, v31.16b +rev32 v4.8h, v7.8h +rev64 v0.16b, v31.16b +rev64 v1.8b, v9.8b +rev64 v13.4h, v21.4h +rev64 v2.8h, v4.8h +rev64 v4.2s, v0.2s +rev64 v6.4s, v8.4s +rshrn v0.2s, v0.2d, #3 +rshrn v0.4h, v0.4s, #3 +rshrn v0.8b, v0.8h, #3 +rshrn2 v0.16b, v0.8h, #3 +rshrn2 v0.4s, v0.2d, #3 +rshrn2 v0.8h, v0.4s, #3 +rsubhn v0.2s, v0.2d, v0.2d +rsubhn v0.4h, v0.4s, v0.4s +rsubhn v0.8b, v0.8h, v0.8h +rsubhn2 v0.16b, v0.8h, v0.8h +rsubhn2 v0.4s, v0.2d, v0.2d +rsubhn2 v0.8h, v0.4s, v0.4s +saba v0.16b, v0.16b, v0.16b +sabal v0.2d, v0.2s, v0.2s +sabal v0.4s, v0.4h, v0.4h +sabal v0.8h, v0.8b, v0.8b +sabal2 v0.2d, v0.4s, v0.4s +sabal2 v0.4s, v0.8h, v0.8h +sabal2 v0.8h, v0.16b, v0.16b +sabd v0.4h, v0.4h, v0.4h +sabdl v0.2d, v0.2s, v0.2s +sabdl v0.4s, v0.4h, v0.4h +sabdl v0.8h, v0.8b, v0.8b +sabdl2 v0.2d, v0.4s, v0.4s +sabdl2 v0.4s, v0.8h, v0.8h +sabdl2 v0.8h, v0.16b, v0.16b +sadalp v0.1d, v0.2s +sadalp v0.2d, v0.4s +sadalp v0.2s, v0.4h +sadalp v0.4h, v0.8b +sadalp v0.4s, v0.8h +sadalp v0.8h, 
v0.16b +saddl v0.2d, v0.2s, v0.2s +saddl v0.4s, v0.4h, v0.4h +saddl v0.8h, v0.8b, v0.8b +saddl2 v0.2d, v0.4s, v0.4s +saddl2 v0.4s, v0.8h, v0.8h +saddl2 v0.8h, v0.16b, v0.16b +saddlp v0.1d, v0.2s +saddlp v0.2d, v0.4s +saddlp v0.2s, v0.4h +saddlp v0.4h, v0.8b +saddlp v0.4s, v0.8h +saddlp v0.8h, v0.16b +saddw v0.2d, v0.2d, v0.2s +saddw v0.4s, v0.4s, v0.4h +saddw v0.8h, v0.8h, v0.8b +saddw2 v0.2d, v0.2d, v0.4s +saddw2 v0.4s, v0.4s, v0.8h +saddw2 v0.8h, v0.8h, v0.16b +scvtf d21, d12 +scvtf d21, d12, #64 +scvtf s22, s13 +scvtf s22, s13, #32 +scvtf v0.2d, v0.2d +scvtf v0.2d, v0.2d, #3 +scvtf v0.2s, v0.2s +scvtf v0.2s, v0.2s, #3 +scvtf v0.4h, v0.4h +scvtf v0.4s, v0.4s +scvtf v0.4s, v0.4s, #3 +scvtf v0.8h, v0.8h +shadd v0.8b, v0.8b, v0.8b +shl d7, d10, #12 +shl v0.16b, v0.16b, #3 +shl v0.2d, v0.2d, #3 +shl v0.4h, v0.4h, #3 +shl v0.4s, v0.4s, #3 +shll v0.2d, v0.2s, #32 +shll v0.4s, v0.4h, #16 +shll v0.8h, v0.8b, #8 +shll v0.2d, v0.2s, #32 +shll v0.4s, v0.4h, #16 +shll v0.8h, v0.8b, #8 +shll2 v0.2d, v0.4s, #32 +shll2 v0.4s, v0.8h, #16 +shll2 v0.8h, v0.16b, #8 +shll2 v0.2d, v0.4s, #32 +shll2 v0.4s, v0.8h, #16 +shll2 v0.8h, v0.16b, #8 +shrn v0.2s, v0.2d, #3 +shrn v0.4h, v0.4s, #3 +shrn v0.8b, v0.8h, #3 +shrn2 v0.16b, v0.8h, #3 +shrn2 v0.4s, v0.2d, #3 +shrn2 v0.8h, v0.4s, #3 +shsub v0.2s, v0.2s, v0.2s +shsub v0.4h, v0.4h, v0.4h +sli d10, d14, #12 +sli v0.16b, v0.16b, #3 +sli v0.2d, v0.2d, #3 +sli v0.2s, v0.2s, #3 +sli v0.4h, v0.4h, #3 +sli v0.4s, v0.4s, #3 +sli v0.8b, v0.8b, #3 +sli v0.8h, v0.8h, #3 +smax v0.2s, v0.2s, v0.2s +smax v0.4h, v0.4h, v0.4h +smax v0.8b, v0.8b, v0.8b +smaxp v0.2s, v0.2s, v0.2s +smaxp v0.4h, v0.4h, v0.4h +smaxp v0.8b, v0.8b, v0.8b +smin v0.16b, v0.16b, v0.16b +smin v0.4s, v0.4s, v0.4s +smin v0.8h, v0.8h, v0.8h +sminp v0.16b, v0.16b, v0.16b +sminp v0.4s, v0.4s, v0.4s +sminp v0.8h, v0.8h, v0.8h +smlal v0.2d, v0.2s, v0.2s +smlal v0.4s, v0.4h, v0.4h +smlal v0.8h, v0.8b, v0.8b +smlal2 v0.2d, v0.4s, v0.4s +smlal2 v0.4s, v0.8h, v0.8h +smlal2 v0.8h, v0.16b, 
v0.16b +smlsl v0.2d, v0.2s, v0.2s +smlsl v0.4s, v0.4h, v0.4h +smlsl v0.8h, v0.8b, v0.8b +smlsl2 v0.2d, v0.4s, v0.4s +smlsl2 v0.4s, v0.8h, v0.8h +smlsl2 v0.8h, v0.16b, v0.16b +smull v0.2d, v0.2s, v0.2s +smull v0.4s, v0.4h, v0.4h +smull v0.8h, v0.8b, v0.8b +smull2 v0.2d, v0.4s, v0.4s +smull2 v0.4s, v0.8h, v0.8h +smull2 v0.8h, v0.16b, v0.16b +sqabs b19, b14 +sqabs d18, d12 +sqabs h21, h15 +sqabs s20, s12 +sqabs v0.16b, v0.16b +sqabs v0.2d, v0.2d +sqabs v0.2s, v0.2s +sqabs v0.4h, v0.4h +sqabs v0.4s, v0.4s +sqabs v0.8b, v0.8b +sqabs v0.8h, v0.8h +sqadd b20, b11, b15 +sqadd v0.16b, v0.16b, v0.16b +sqadd v0.2s, v0.2s, v0.2s +sqdmlal d19, s24, s12 +sqdmlal d8, s9, v0.s[1] +sqdmlal s0, h0, v0.h[3] +sqdmlal s17, h27, h12 +sqdmlal v0.2d, v0.2s, v0.2s +sqdmlal v0.4s, v0.4h, v0.4h +sqdmlal2 v0.2d, v0.4s, v0.4s +sqdmlal2 v0.4s, v0.8h, v0.8h +sqdmlsl d12, s23, s13 +sqdmlsl d8, s9, v0.s[1] +sqdmlsl s0, h0, v0.h[3] +sqdmlsl s14, h12, h25 +sqdmlsl v0.2d, v0.2s, v0.2s +sqdmlsl v0.4s, v0.4h, v0.4h +sqdmlsl2 v0.2d, v0.4s, v0.4s +sqdmlsl2 v0.4s, v0.8h, v0.8h +sqdmulh h10, h11, h12 +sqdmulh h7, h15, v0.h[3] +sqdmulh s15, s14, v0.s[1] +sqdmulh s20, s21, s2 +sqdmulh v0.2s, v0.2s, v0.2s +sqdmulh v0.4s, v0.4s, v0.4s +sqdmull d1, s1, v0.s[1] +sqdmull d15, s22, s12 +sqdmull s1, h1, v0.h[3] +sqdmull s12, h22, h12 +sqdmull v0.2d, v0.2s, v0.2s +sqdmull v0.4s, v0.4h, v0.4h +sqdmull2 v0.2d, v0.4s, v0.4s +sqdmull2 v0.4s, v0.8h, v0.8h +sqneg b19, b14 +sqneg d18, d12 +sqneg h21, h15 +sqneg s20, s12 +sqneg v0.16b, v0.16b +sqneg v0.2d, v0.2d +sqneg v0.2s, v0.2s +sqneg v0.4h, v0.4h +sqneg v0.4s, v0.4s +sqneg v0.8b, v0.8b +sqneg v0.8h, v0.8h +sqrdmulh h10, h11, h12 +sqrdmulh h7, h15, v0.h[3] +sqrdmulh s15, s14, v0.s[1] +sqrdmulh s20, s21, s2 +sqrdmulh v0.4h, v0.4h, v0.4h +sqrdmulh v0.8h, v0.8h, v0.8h +sqrshl d31, d31, d31 +sqrshl h3, h4, h15 +sqrshl v0.2s, v0.2s, v0.2s +sqrshl v0.4h, v0.4h, v0.4h +sqrshl v0.8b, v0.8b, v0.8b +sqrshrn b10, h13, #2 +sqrshrn h15, s10, #6 +sqrshrn s15, d12, #9 +sqrshrn v0.2s, 
v0.2d, #3 +sqrshrn v0.4h, v0.4s, #3 +sqrshrn v0.8b, v0.8h, #3 +sqrshrn2 v0.16b, v0.8h, #3 +sqrshrn2 v0.4s, v0.2d, #3 +sqrshrn2 v0.8h, v0.4s, #3 +sqrshrun b17, h10, #6 +sqrshrun h10, s13, #15 +sqrshrun s22, d16, #31 +sqrshrun v0.2s, v0.2d, #3 +sqrshrun v0.4h, v0.4s, #3 +sqrshrun v0.8b, v0.8h, #3 +sqrshrun2 v0.16b, v0.8h, #3 +sqrshrun2 v0.4s, v0.2d, #3 +sqrshrun2 v0.8h, v0.4s, #3 +sqshl b11, b19, #7 +sqshl d15, d16, #51 +sqshl d31, d31, d31 +sqshl h13, h18, #11 +sqshl h3, h4, h15 +sqshl s14, s17, #22 +sqshl v0.16b, v0.16b, #3 +sqshl v0.2d, v0.2d, #3 +sqshl v0.2s, v0.2s, #3 +sqshl v0.2s, v0.2s, v0.2s +sqshl v0.4h, v0.4h, #3 +sqshl v0.4h, v0.4h, v0.4h +sqshl v0.4s, v0.4s, #3 +sqshl v0.8b, v0.8b, #3 +sqshl v0.8b, v0.8b, v0.8b +sqshl v0.8h, v0.8h, #3 +sqshlu b15, b18, #6 +sqshlu d11, d13, #32 +sqshlu h19, h17, #6 +sqshlu s16, s14, #25 +sqshlu v0.16b, v0.16b, #3 +sqshlu v0.2d, v0.2d, #3 +sqshlu v0.2s, v0.2s, #3 +sqshlu v0.4h, v0.4h, #3 +sqshlu v0.4s, v0.4s, #3 +sqshlu v0.8b, v0.8b, #3 +sqshlu v0.8h, v0.8h, #3 +sqshrn b10, h15, #5 +sqshrn h17, s10, #4 +sqshrn s18, d10, #31 +sqshrn v0.2s, v0.2d, #3 +sqshrn v0.4h, v0.4s, #3 +sqshrn v0.8b, v0.8h, #3 +sqshrn2 v0.16b, v0.8h, #3 +sqshrn2 v0.4s, v0.2d, #3 +sqshrn2 v0.8h, v0.4s, #3 +sqshrun b15, h10, #7 +sqshrun h20, s14, #3 +sqshrun s10, d15, #15 +sqshrun v0.2s, v0.2d, #3 +sqshrun v0.4h, v0.4s, #3 +sqshrun v0.8b, v0.8h, #3 +sqshrun2 v0.16b, v0.8h, #3 +sqshrun2 v0.4s, v0.2d, #3 +sqshrun2 v0.8h, v0.4s, #3 +sqsub s20, s10, s7 +sqsub v0.2d, v0.2d, v0.2d +sqsub v0.4s, v0.4s, v0.4s +sqsub v0.8b, v0.8b, v0.8b +sqxtn b18, h18 +sqxtn h20, s17 +sqxtn s19, d14 +sqxtn v0.2s, v0.2d +sqxtn v0.4h, v0.4s +sqxtn v0.8b, v0.8h +sqxtn2 v0.16b, v0.8h +sqxtn2 v0.4s, v0.2d +sqxtn2 v0.8h, v0.4s +sqxtun b19, h14 +sqxtun h21, s15 +sqxtun s20, d12 +sqxtun v0.2s, v0.2d +sqxtun v0.4h, v0.4s +sqxtun v0.8b, v0.8h +sqxtun2 v0.16b, v0.8h +sqxtun2 v0.4s, v0.2d +sqxtun2 v0.8h, v0.4s +srhadd v0.2s, v0.2s, v0.2s +srhadd v0.4h, v0.4h, v0.4h +srhadd v0.8b, v0.8b, 
v0.8b +sri d10, d12, #14 +sri v0.16b, v0.16b, #3 +sri v0.2d, v0.2d, #3 +sri v0.2s, v0.2s, #3 +sri v0.4h, v0.4h, #3 +sri v0.4s, v0.4s, #3 +sri v0.8b, v0.8b, #3 +sri v0.8h, v0.8h, #3 +srshl d16, d16, d16 +srshl v0.2s, v0.2s, v0.2s +srshl v0.4h, v0.4h, v0.4h +srshl v0.8b, v0.8b, v0.8b +srshr d19, d18, #7 +srshr v0.16b, v0.16b, #3 +srshr v0.2d, v0.2d, #3 +srshr v0.2s, v0.2s, #3 +srshr v0.4h, v0.4h, #3 +srshr v0.4s, v0.4s, #3 +srshr v0.8b, v0.8b, #3 +srshr v0.8h, v0.8h, #3 +srsra d15, d11, #19 +srsra v0.16b, v0.16b, #3 +srsra v0.2d, v0.2d, #3 +srsra v0.2s, v0.2s, #3 +srsra v0.4h, v0.4h, #3 +srsra v0.4s, v0.4s, #3 +srsra v0.8b, v0.8b, #3 +srsra v0.8h, v0.8h, #3 +sshl d31, d31, d31 +sshl v0.2d, v0.2d, v0.2d +sshl v0.2s, v0.2s, v0.2s +sshl v0.4h, v0.4h, v0.4h +sshl v0.8b, v0.8b, v0.8b +sshll v0.2d, v0.2s, #3 +sshll2 v0.4s, v0.8h, #3 +sshr d15, d16, #12 +sshr v0.16b, v0.16b, #3 +sshr v0.2d, v0.2d, #3 +sshr v0.2s, v0.2s, #3 +sshr v0.4h, v0.4h, #3 +sshr v0.4s, v0.4s, #3 +sshr v0.8b, v0.8b, #3 +sshr v0.8h, v0.8h, #3 +ssra d18, d12, #21 +ssra v0.16b, v0.16b, #3 +ssra v0.2d, v0.2d, #3 +ssra v0.2s, v0.2s, #3 +ssra v0.4h, v0.4h, #3 +ssra v0.4s, v0.4s, #3 +ssra v0.8b, v0.8b, #3 +ssra v0.8h, v0.8h, #3 +ssubl v0.2d, v0.2s, v0.2s +ssubl v0.4s, v0.4h, v0.4h +ssubl v0.8h, v0.8b, v0.8b +ssubl2 v0.2d, v0.4s, v0.4s +ssubl2 v0.4s, v0.8h, v0.8h +ssubl2 v0.8h, v0.16b, v0.16b +ssubw v0.2d, v0.2d, v0.2s +ssubw v0.4s, v0.4s, v0.4h +ssubw v0.8h, v0.8h, v0.8b +ssubw2 v0.2d, v0.2d, v0.4s +ssubw2 v0.4s, v0.4s, v0.8h +ssubw2 v0.8h, v0.8h, v0.16b +st1 { v0.16b }, [x0] +st1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +st1 { v0.4s, v1.4s }, [sp], #32 +st1 { v0.4s, v1.4s, v2.4s }, [sp] +st1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +st1 { v0.8h }, [x15], x2 +st1 { v0.8h, v1.8h }, [x15] +st1 { v0.d }[1], [x0] +st1 { v0.d }[1], [x0], #8 +st2 { v0.16b, v1.16b }, [x0], x1 +st2 { v0.8b, v1.8b }, [x0] +st2 { v0.s, v1.s }[3], [sp] +st2 { v0.s, v1.s }[3], [sp], #8 +st3 { v0.4h, 
v1.4h, v2.4h }, [x15] +st3 { v0.8h, v1.8h, v2.8h }, [x15], x2 +st3 { v0.h, v1.h, v2.h }[7], [x15] +st3 { v0.h, v1.h, v2.h }[7], [x15], #6 +st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0] +st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0], x5 +sub d15, d5, d16 +sub v0.2d, v0.2d, v0.2d +suqadd b19, b14 +suqadd d18, d22 +suqadd h20, h15 +suqadd s21, s12 +suqadd v0.16b, v0.16b +suqadd v0.2d, v0.2d +suqadd v0.2s, v0.2s +suqadd v0.4h, v0.4h +suqadd v0.4s, v0.4s +suqadd v0.8b, v0.8b +suqadd v0.8h, v0.8h +tbl v0.16b, { v0.16b }, v0.16b +tbl v0.16b, { v0.16b, v1.16b }, v0.16b +tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +tbl v0.8b, { v0.16b }, v0.8b +tbl v0.8b, { v0.16b, v1.16b }, v0.8b +tbl v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +tbx v0.16b, { v0.16b }, v0.16b +tbx v0.16b, { v0.16b, v1.16b }, v0.16b +tbx v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +tbx v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +tbx v0.8b, { v0.16b }, v0.8b +tbx v0.8b, { v0.16b, v1.16b }, v0.8b +tbx v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +tbx v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +trn1 v0.16b, v0.16b, v0.16b +trn1 v0.2d, v0.2d, v0.2d +trn1 v0.2s, v0.2s, v0.2s +trn1 v0.4h, v0.4h, v0.4h +trn1 v0.4s, v0.4s, v0.4s +trn1 v0.8b, v0.8b, v0.8b +trn1 v0.8h, v0.8h, v0.8h +trn2 v0.16b, v0.16b, v0.16b +trn2 v0.2d, v0.2d, v0.2d +trn2 v0.2s, v0.2s, v0.2s +trn2 v0.4h, v0.4h, v0.4h +trn2 v0.4s, v0.4s, v0.4s +trn2 v0.8b, v0.8b, v0.8b +trn2 v0.8h, v0.8h, v0.8h +uaba v0.8b, v0.8b, v0.8b +uabal v0.2d, v0.2s, v0.2s +uabal v0.4s, v0.4h, v0.4h +uabal v0.8h, v0.8b, v0.8b +uabal2 v0.2d, v0.4s, v0.4s +uabal2 v0.4s, v0.8h, v0.8h +uabal2 v0.8h, v0.16b, v0.16b +uabd v0.4h, v0.4h, v0.4h +uabdl v0.2d, v0.2s, v0.2s +uabdl v0.4s, v0.4h, v0.4h +uabdl v0.8h, v0.8b, v0.8b +uabdl2 v0.2d, v0.4s, v0.4s +uabdl2 v0.4s, v0.8h, v0.8h +uabdl2 
v0.8h, v0.16b, v0.16b +uadalp v0.1d, v0.2s +uadalp v0.2d, v0.4s +uadalp v0.2s, v0.4h +uadalp v0.4h, v0.8b +uadalp v0.4s, v0.8h +uadalp v0.8h, v0.16b +uaddl v0.2d, v0.2s, v0.2s +uaddl v0.4s, v0.4h, v0.4h +uaddl v0.8h, v0.8b, v0.8b +uaddl2 v0.2d, v0.4s, v0.4s +uaddl2 v0.4s, v0.8h, v0.8h +uaddl2 v0.8h, v0.16b, v0.16b +uaddlp v0.1d, v0.2s +uaddlp v0.2d, v0.4s +uaddlp v0.2s, v0.4h +uaddlp v0.4h, v0.8b +uaddlp v0.4s, v0.8h +uaddlp v0.8h, v0.16b +uaddw v0.2d, v0.2d, v0.2s +uaddw v0.4s, v0.4s, v0.4h +uaddw v0.8h, v0.8h, v0.8b +uaddw2 v0.2d, v0.2d, v0.4s +uaddw2 v0.4s, v0.4s, v0.8h +uaddw2 v0.8h, v0.8h, v0.16b +ucvtf d21, d14 +ucvtf d21, d14, #64 +ucvtf s22, s13 +ucvtf s22, s13, #32 +ucvtf v0.2d, v0.2d +ucvtf v0.2d, v0.2d, #3 +ucvtf v0.2s, v0.2s +ucvtf v0.2s, v0.2s, #3 +ucvtf v0.4h, v0.4h +ucvtf v0.4s, v0.4s +ucvtf v0.4s, v0.4s, #3 +ucvtf v0.8h, v0.8h +uhadd v0.16b, v0.16b, v0.16b +uhadd v0.8h, v0.8h, v0.8h +uhsub v0.4s, v0.4s, v0.4s +umax v0.16b, v0.16b, v0.16b +umax v0.4s, v0.4s, v0.4s +umax v0.8h, v0.8h, v0.8h +umaxp v0.16b, v0.16b, v0.16b +umaxp v0.4s, v0.4s, v0.4s +umaxp v0.8h, v0.8h, v0.8h +umin v0.2s, v0.2s, v0.2s +umin v0.4h, v0.4h, v0.4h +umin v0.8b, v0.8b, v0.8b +uminp v0.2s, v0.2s, v0.2s +uminp v0.4h, v0.4h, v0.4h +uminp v0.8b, v0.8b, v0.8b +umlal v0.2d, v0.2s, v0.2s +umlal v0.4s, v0.4h, v0.4h +umlal v0.8h, v0.8b, v0.8b +umlal2 v0.2d, v0.4s, v0.4s +umlal2 v0.4s, v0.8h, v0.8h +umlal2 v0.8h, v0.16b, v0.16b +umlsl v0.2d, v0.2s, v0.2s +umlsl v0.4s, v0.4h, v0.4h +umlsl v0.8h, v0.8b, v0.8b +umlsl2 v0.2d, v0.4s, v0.4s +umlsl2 v0.4s, v0.8h, v0.8h +umlsl2 v0.8h, v0.16b, v0.16b +umull v0.2d, v0.2s, v0.2s +umull v0.4s, v0.4h, v0.4h +umull v0.8h, v0.8b, v0.8b +umull2 v0.2d, v0.4s, v0.4s +umull2 v0.4s, v0.8h, v0.8h +umull2 v0.8h, v0.16b, v0.16b +uqadd h0, h1, h5 +uqadd v0.8h, v0.8h, v0.8h +uqrshl b11, b20, b30 +uqrshl s23, s20, s16 +uqrshl v0.16b, v0.16b, v0.16b +uqrshl v0.4s, v0.4s, v0.4s +uqrshl v0.4s, v0.4s, v0.4s +uqrshl v0.8h, v0.8h, v0.8h +uqrshrn b10, h12, #5 +uqrshrn 
h12, s10, #14 +uqrshrn s10, d10, #25 +uqrshrn v0.2s, v0.2d, #3 +uqrshrn v0.4h, v0.4s, #3 +uqrshrn v0.8b, v0.8h, #3 +uqrshrn2 v0.16b, v0.8h, #3 +uqrshrn2 v0.4s, v0.2d, #3 +uqrshrn2 v0.8h, v0.4s, #3 +uqshl b11, b20, b30 +uqshl b18, b15, #6 +uqshl d15, d12, #19 +uqshl h11, h18, #7 +uqshl s14, s19, #18 +uqshl s23, s20, s16 +uqshl v0.16b, v0.16b, #3 +uqshl v0.16b, v0.16b, v0.16b +uqshl v0.2d, v0.2d, #3 +uqshl v0.2d, v0.2d, v0.2d +uqshl v0.2s, v0.2s, #3 +uqshl v0.4h, v0.4h, #3 +uqshl v0.4s, v0.4s, #3 +uqshl v0.4s, v0.4s, v0.4s +uqshl v0.8b, v0.8b, #3 +uqshl v0.8h, v0.8h, #3 +uqshl v0.8h, v0.8h, v0.8h +uqshrn b12, h10, #7 +uqshrn h10, s14, #5 +uqshrn s10, d12, #13 +uqshrn v0.2s, v0.2d, #3 +uqshrn v0.4h, v0.4s, #3 +uqshrn v0.8b, v0.8h, #3 +uqshrn2 v0.16b, v0.8h, #3 +uqshrn2 v0.4s, v0.2d, #3 +uqshrn2 v0.8h, v0.4s, #3 +uqsub d16, d16, d16 +uqsub v0.4h, v0.4h, v0.4h +uqxtn b18, h18 +uqxtn h20, s17 +uqxtn s19, d14 +uqxtn v0.2s, v0.2d +uqxtn v0.4h, v0.4s +uqxtn v0.8b, v0.8h +uqxtn2 v0.16b, v0.8h +uqxtn2 v0.4s, v0.2d +uqxtn2 v0.8h, v0.4s +urecpe v0.2s, v0.2s +urecpe v0.4s, v0.4s +urhadd v0.16b, v0.16b, v0.16b +urhadd v0.4s, v0.4s, v0.4s +urhadd v0.8h, v0.8h, v0.8h +urshl d8, d7, d4 +urshl v0.16b, v0.16b, v0.16b +urshl v0.2d, v0.2d, v0.2d +urshl v0.4s, v0.4s, v0.4s +urshl v0.8h, v0.8h, v0.8h +urshr d20, d23, #31 +urshr v0.16b, v0.16b, #3 +urshr v0.2d, v0.2d, #3 +urshr v0.2s, v0.2s, #3 +urshr v0.4h, v0.4h, #3 +urshr v0.4s, v0.4s, #3 +urshr v0.8b, v0.8b, #3 +urshr v0.8h, v0.8h, #3 +ursqrte v0.2s, v0.2s +ursqrte v0.4s, v0.4s +ursra d18, d10, #13 +ursra v0.16b, v0.16b, #3 +ursra v0.2d, v0.2d, #3 +ursra v0.2s, v0.2s, #3 +ursra v0.4h, v0.4h, #3 +ursra v0.4s, v0.4s, #3 +ursra v0.8b, v0.8b, #3 +ursra v0.8h, v0.8h, #3 +ushl d0, d0, d0 +ushl v0.16b, v0.16b, v0.16b +ushl v0.4s, v0.4s, v0.4s +ushl v0.8h, v0.8h, v0.8h +ushll v0.4s, v0.4h, #3 +ushll2 v0.8h, v0.16b, #3 +ushr d10, d17, #18 +ushr v0.16b, v0.16b, #3 +ushr v0.2d, v0.2d, #3 +ushr v0.2s, v0.2s, #3 +ushr v0.4h, v0.4h, #3 +ushr v0.4s, 
v0.4s, #3 +ushr v0.8b, v0.8b, #3 +ushr v0.8h, v0.8h, #3 +usqadd b19, b14 +usqadd d18, d22 +usqadd h20, h15 +usqadd s21, s12 +usqadd v0.16b, v0.16b +usqadd v0.2d, v0.2d +usqadd v0.2s, v0.2s +usqadd v0.4h, v0.4h +usqadd v0.4s, v0.4s +usqadd v0.8b, v0.8b +usqadd v0.8h, v0.8h +usra d20, d13, #61 +usra v0.16b, v0.16b, #3 +usra v0.2d, v0.2d, #3 +usra v0.2s, v0.2s, #3 +usra v0.4h, v0.4h, #3 +usra v0.4s, v0.4s, #3 +usra v0.8b, v0.8b, #3 +usra v0.8h, v0.8h, #3 +usubl v0.2d, v0.2s, v0.2s +usubl v0.4s, v0.4h, v0.4h +usubl v0.8h, v0.8b, v0.8b +usubl2 v0.2d, v0.4s, v0.4s +usubl2 v0.4s, v0.8h, v0.8h +usubl2 v0.8h, v0.16b, v0.16b +usubw v0.2d, v0.2d, v0.2s +usubw v0.4s, v0.4s, v0.4h +usubw v0.8h, v0.8h, v0.8b +usubw2 v0.2d, v0.2d, v0.4s +usubw2 v0.4s, v0.4s, v0.8h +usubw2 v0.8h, v0.8h, v0.16b +uzp1 v0.16b, v0.16b, v0.16b +uzp1 v0.2d, v0.2d, v0.2d +uzp1 v0.2s, v0.2s, v0.2s +uzp1 v0.4h, v0.4h, v0.4h +uzp1 v0.4s, v0.4s, v0.4s +uzp1 v0.8b, v0.8b, v0.8b +uzp1 v0.8h, v0.8h, v0.8h +uzp2 v0.16b, v0.16b, v0.16b +uzp2 v0.2d, v0.2d, v0.2d +uzp2 v0.2s, v0.2s, v0.2s +uzp2 v0.4h, v0.4h, v0.4h +uzp2 v0.4s, v0.4s, v0.4s +uzp2 v0.8b, v0.8b, v0.8b +uzp2 v0.8h, v0.8h, v0.8h +xtn v0.2s, v0.2d +xtn v0.4h, v0.4s +xtn v0.8b, v0.8h +xtn2 v0.16b, v0.8h +xtn2 v0.4s, v0.2d +xtn2 v0.8h, v0.4s +zip1 v0.16b, v0.16b, v0.16b +zip1 v0.2d, v0.2d, v0.2d +zip1 v0.2s, v0.2s, v0.2s +zip1 v0.4h, v0.4h, v0.4h +zip1 v0.4s, v0.4s, v0.4s +zip1 v0.8b, v0.8b, v0.8b +zip1 v0.8h, v0.8h, v0.8h +zip2 v0.16b, v0.16b, v0.16b +zip2 v0.2d, v0.2d, v0.2d +zip2 v0.2s, v0.2s, v0.2s +zip2 v0.4h, v0.4h, v0.4h +zip2 v0.4s, v0.4s, v0.4s +zip2 v0.8b, v0.8b, v0.8b +zip2 v0.8h, v0.8h, v0.8h + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 2 0.50 abs d29, d24 +# CHECK-NEXT: 1 2 0.50 abs v0.16b, v0.16b +# CHECK-NEXT: 1 
2 0.50 abs v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 abs v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 abs v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 abs v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 abs v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 abs v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 add d17, d31, d29 +# CHECK-NEXT: 1 2 0.50 add v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 2 6 1.00 addhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 addhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 2 6 1.00 addhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 addhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 addhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 addhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 addp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 addp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 and v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 bic v0.4h, #15, lsl #8 +# CHECK-NEXT: 1 2 0.50 bic v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 bif v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 bit v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 bsl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 cls v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 cls v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 cls v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 cls v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 cls v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 cls v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 clz v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 clz v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 clz v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 clz v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 clz v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 clz v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 cmeq d20, d21, #0 +# CHECK-NEXT: 1 2 0.50 cmeq d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmeq v0.16b, v0.16b, #0 +# CHECK-NEXT: 1 2 0.50 cmeq v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 cmge d20, d21, #0 +# CHECK-NEXT: 1 2 0.50 cmge d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmge v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 cmge v0.8b, v0.8b, #0 +# CHECK-NEXT: 1 2 0.50 cmgt d20, d21, #0 +# CHECK-NEXT: 1 2 0.50 cmgt d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmgt v0.2s, 
v0.2s, #0 +# CHECK-NEXT: 1 2 0.50 cmgt v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 cmhi d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmhi v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 cmhs d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmhs v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 cmle d20, d21, #0 +# CHECK-NEXT: 1 2 0.50 cmle v0.2d, v0.2d, #0 +# CHECK-NEXT: 1 2 0.50 cmlt d20, d21, #0 +# CHECK-NEXT: 1 2 0.50 cmlt v0.8h, v0.8h, #0 +# CHECK-NEXT: 1 2 0.50 cmtst d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmtst v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 cnt v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 cnt v0.8b, v0.8b +# CHECK-NEXT: 1 5 1.00 dup v0.16b, w28 +# CHECK-NEXT: 1 5 1.00 dup v0.2d, x28 +# CHECK-NEXT: 1 5 1.00 dup v0.2s, w28 +# CHECK-NEXT: 1 5 1.00 dup v0.4h, w28 +# CHECK-NEXT: 1 5 1.00 dup v0.4s, w28 +# CHECK-NEXT: 1 5 1.00 dup v0.8b, w28 +# CHECK-NEXT: 1 5 1.00 dup v0.8h, w28 +# CHECK-NEXT: 1 2 0.50 eor v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 ext v0.16b, v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 ext v0.8b, v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 fabd d29, d24, d20 +# CHECK-NEXT: 1 3 0.50 fabd s29, s24, s20 +# CHECK-NEXT: 1 3 0.50 fabd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fabs v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fabs v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fabs v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fabs v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fabs v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 facge d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 facge s10, s11, s12 +# CHECK-NEXT: 1 3 0.50 facge v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 facgt d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 facgt s10, s11, s12 +# CHECK-NEXT: 1 3 0.50 facgt v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 faddp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 faddp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcmeq d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmeq d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 fcmeq s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmeq s10, s11, s12 +# CHECK-NEXT: 1 3 
0.50 fcmeq v0.2s, v0.2s, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmeq v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcmge d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmge d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 fcmge s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmge s10, s11, s12 +# CHECK-NEXT: 1 3 0.50 fcmge v0.2d, v0.2d, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmge v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcmgt d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmgt d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 fcmgt s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmgt s10, s11, s12 +# CHECK-NEXT: 1 3 0.50 fcmgt v0.4s, v0.4s, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmgt v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcmle d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmle s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmle v0.2d, v0.2d, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmlt d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmlt s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmlt v0.4s, v0.4s, #0.0 +# CHECK-NEXT: 1 3 0.50 fcvtas d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtas s12, s13 +# CHECK-NEXT: 1 3 0.50 fcvtas v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtas v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtas v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtas v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtas v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtau d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtau s12, s13 +# CHECK-NEXT: 1 3 0.50 fcvtau v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtau v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtau v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtau v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtau v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtl v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtl v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtl2 v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtl2 v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtms d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtms s22, s13 +# CHECK-NEXT: 1 3 0.50 fcvtms v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtms v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtms v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtms v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtms v0.8h, v0.8h +# 
CHECK-NEXT: 1 3 0.50 fcvtmu d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtmu s12, s13 +# CHECK-NEXT: 1 3 0.50 fcvtmu v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtmu v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtmu v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtmu v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtmu v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtn v0.2s, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtn v0.4h, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtn2 v0.4s, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtn2 v0.8h, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtns d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtns s22, s13 +# CHECK-NEXT: 1 3 0.50 fcvtns v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtns v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtns v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtns v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtns v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtnu d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtnu s12, s13 +# CHECK-NEXT: 1 3 0.50 fcvtnu v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtnu v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtnu v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtnu v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtnu v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtps d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtps s22, s13 +# CHECK-NEXT: 1 3 0.50 fcvtps v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtps v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtps v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtps v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtps v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtpu d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtpu s12, s13 +# CHECK-NEXT: 1 3 0.50 fcvtpu v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtpu v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtpu v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtpu v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtpu v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtxn s22, d13 +# CHECK-NEXT: 1 3 0.50 fcvtxn v0.2s, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtxn2 v0.4s, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtzs d21, d12, #1 +# CHECK-NEXT: 1 3 0.50 fcvtzs d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtzs s12, s13 +# CHECK-NEXT: 1 3 0.50 fcvtzs s21, s12, #1 +# CHECK-NEXT: 1 3 0.50 fcvtzs 
v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtzs v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 fcvtzs v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtzs v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 fcvtzs v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtzs v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtzs v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 fcvtzs v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fcvtzu d21, d12, #1 +# CHECK-NEXT: 1 3 0.50 fcvtzu d21, d14 +# CHECK-NEXT: 1 3 0.50 fcvtzu s12, s13 +# CHECK-NEXT: 1 3 0.50 fcvtzu s21, s12, #1 +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 fcvtzu v0.8h, v0.8h +# CHECK-NEXT: 1 12 1.00 fdiv v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fmax v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fmax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fmax v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fmaxnm v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fmaxnm v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fmaxnm v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fmaxnmp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fmaxnmp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fmaxnmp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fmaxp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fmaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fmaxp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fmin v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fmin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fmin v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fminnm v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fminnm v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fminnm v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fminnmp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fminnmp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fminnmp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 
0.50 fminp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fminp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fmla d0, d1, v0.d[1] +# CHECK-NEXT: 1 4 0.50 fmla s0, s1, v0.s[3] +# CHECK-NEXT: 1 4 0.50 fmla v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmls d0, d4, v0.d[1] +# CHECK-NEXT: 1 4 0.50 fmls s3, s5, v0.s[3] +# CHECK-NEXT: 1 4 0.50 fmls v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 fmov v0.2d, #-1.25000000 +# CHECK-NEXT: 1 2 0.50 fmov v0.2s, #13.00000000 +# CHECK-NEXT: 1 2 0.50 fmov v0.4s, #1.00000000 +# CHECK-NEXT: 1 4 0.50 fmul d0, d1, v0.d[1] +# CHECK-NEXT: 1 4 0.50 fmul s0, s1, v0.s[3] +# CHECK-NEXT: 1 4 0.50 fmul v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmulx d0, d4, v0.d[1] +# CHECK-NEXT: 1 4 0.50 fmulx d23, d11, d1 +# CHECK-NEXT: 1 4 0.50 fmulx s20, s22, s15 +# CHECK-NEXT: 1 4 0.50 fmulx s3, s5, v0.s[3] +# CHECK-NEXT: 1 4 0.50 fmulx v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fmulx v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmulx v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fneg v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 fneg v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fneg v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 fneg v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fneg v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 frecpe d13, d13 +# CHECK-NEXT: 2 6 1.00 frecpe s19, s14 +# CHECK-NEXT: 2 6 1.00 frecpe v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 frecpe v0.2s, v0.2s +# CHECK-NEXT: 2 6 1.00 frecpe v0.4h, v0.4h +# CHECK-NEXT: 2 6 1.00 frecpe v0.4s, v0.4s +# CHECK-NEXT: 2 6 1.00 frecpe v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 frecps v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frecps d22, d30, d21 +# CHECK-NEXT: 1 3 0.50 frecps s21, s16, s13 +# CHECK-NEXT: 1 3 0.50 frecpx d16, d19 +# CHECK-NEXT: 1 3 0.50 frecpx s18, s10 +# CHECK-NEXT: 1 3 0.50 frinta v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 frinta v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 frinta v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 frinta v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frinta v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 
frinti v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 frinti v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 frinti v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 frinti v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frinti v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 frintm v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 frintm v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 frintm v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 frintm v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frintm v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 frintn v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 frintn v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 frintn v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 frintn v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frintn v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 frintp v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 frintp v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 frintp v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 frintp v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frintp v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 frintx v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 frintx v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 frintx v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 frintx v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frintx v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 frintz v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 frintz v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 frintz v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 frintz v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 frintz v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 frsqrte d21, d12 +# CHECK-NEXT: 2 6 1.00 frsqrte s22, s13 +# CHECK-NEXT: 2 6 1.00 frsqrte v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 frsqrte v0.2s, v0.2s +# CHECK-NEXT: 2 6 1.00 frsqrte v0.4h, v0.4h +# CHECK-NEXT: 2 6 1.00 frsqrte v0.4s, v0.4s +# CHECK-NEXT: 2 6 1.00 frsqrte v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 frsqrts d8, d22, d18 +# CHECK-NEXT: 1 3 0.50 frsqrts s21, s5, s12 +# CHECK-NEXT: 1 3 0.50 frsqrts v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 63 1.00 fsqrt v0.2d, v0.2d +# CHECK-NEXT: 1 33 1.00 fsqrt v0.2s, v0.2s +# CHECK-NEXT: 1 39 1.00 fsqrt v0.4h, v0.4h +# CHECK-NEXT: 1 33 1.00 fsqrt v0.4s, v0.4s +# CHECK-NEXT: 1 39 1.00 fsqrt v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 fsub 
v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 * ld1 { v0.16b }, [x0] +# CHECK-NEXT: 3 5 1.50 * ld1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +# CHECK-NEXT: 4 5 2.00 * ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +# CHECK-NEXT: 2 4 1.00 * ld1 { v0.4s, v1.4s }, [sp], #32 +# CHECK-NEXT: 3 5 1.50 * ld1 { v0.4s, v1.4s, v2.4s }, [sp] +# CHECK-NEXT: 4 5 2.00 * ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +# CHECK-NEXT: 1 4 0.50 * ld1 { v0.8h }, [x15], x2 +# CHECK-NEXT: 2 4 1.00 * ld1 { v0.8h, v1.8h }, [x15] +# CHECK-NEXT: 2 6 0.50 * ld1 { v0.b }[9], [x0] +# CHECK-NEXT: 2 6 0.50 * ld1 { v0.b }[9], [x0], #1 +# CHECK-NEXT: 2 6 0.50 * ld1r { v0.16b }, [x0] +# CHECK-NEXT: 2 6 0.50 * ld1r { v0.16b }, [x0], #1 +# CHECK-NEXT: 2 6 0.50 * ld1r { v0.8h }, [x15] +# CHECK-NEXT: 2 6 0.50 * ld1r { v0.8h }, [x15], #2 +# CHECK-NEXT: 4 6 1.00 * ld2 { v0.16b, v1.16b }, [x0], x1 +# CHECK-NEXT: 5 8 1.50 * ld2 { v0.8b, v1.8b }, [x0] +# CHECK-NEXT: 4 6 1.00 * ld2 { v0.h, v1.h }[7], [x15] +# CHECK-NEXT: 4 6 1.00 * ld2 { v0.h, v1.h }[7], [x15], #4 +# CHECK-NEXT: 4 6 1.00 * ld2r { v0.2d, v1.2d }, [x0] +# CHECK-NEXT: 4 6 1.00 * ld2r { v0.2d, v1.2d }, [x0], #16 +# CHECK-NEXT: 4 6 1.00 * ld2r { v0.4s, v1.4s }, [sp] +# CHECK-NEXT: 4 6 1.00 * ld2r { v0.4s, v1.4s }, [sp], #8 +# CHECK-NEXT: 6 9 1.50 * ld3 { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: 6 8 1.50 * ld3 { v0.8h, v1.8h, v2.8h }, [x15], x2 +# CHECK-NEXT: 6 7 1.50 * ld3 { v0.s, v1.s, v2.s }[3], [sp] +# CHECK-NEXT: 6 7 1.50 * ld3 { v0.s, v1.s, v2.s }[3], [sp], x3 +# CHECK-NEXT: 6 7 1.50 * ld3r { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: 6 7 1.50 * ld3r { v0.4h, v1.4h, v2.4h }, [x15], #6 +# CHECK-NEXT: 6 7 1.50 * ld3r { v0.8b, v1.8b, v2.8b }, [x0] +# CHECK-NEXT: 6 7 1.50 * ld3r { v0.8b, v1.8b, v2.8b }, [x0], #3 +# CHECK-NEXT: 12 11 2.00 * ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: 12 10 2.00 * ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +# CHECK-NEXT: 8 7 2.00 * ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0] +# CHECK-NEXT: 8 7 2.00 * ld4 { v0.d, 
v1.d, v2.d, v3.d }[1], [x0], #32 +# CHECK-NEXT: 8 7 2.00 * ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0], x0 +# CHECK-NEXT: 4 5 2.00 * ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp] +# CHECK-NEXT: 4 5 2.00 * ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp], x7 +# CHECK-NEXT: 8 7 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: 8 7 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x30 +# CHECK-NEXT: 1 3 0.50 mla v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 mls v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 mov b0, v0.b[15] +# CHECK-NEXT: 1 3 0.50 mov d6, v0.d[1] +# CHECK-NEXT: 1 3 0.50 mov h2, v0.h[5] +# CHECK-NEXT: 1 3 0.50 mov s17, v0.s[2] +# CHECK-NEXT: 1 2 0.50 mov v2.b[0], v0.b[0] +# CHECK-NEXT: 1 2 0.50 mov v2.h[1], v0.h[1] +# CHECK-NEXT: 1 2 0.50 mov v2.s[2], v0.s[2] +# CHECK-NEXT: 1 2 0.50 mov v2.d[1], v0.d[1] +# CHECK-NEXT: 2 7 1.00 mov v0.b[0], w8 +# CHECK-NEXT: 2 7 1.00 mov v0.h[1], w8 +# CHECK-NEXT: 2 7 1.00 mov v0.s[2], w8 +# CHECK-NEXT: 2 7 1.00 mov v0.d[1], x8 +# CHECK-NEXT: 1 2 0.50 mov v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 mov v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 movi d15, #0xff00ff00ff00ff +# CHECK-NEXT: 1 2 0.50 movi v0.16b, #31 +# CHECK-NEXT: 1 2 0.50 movi v0.2d, #0xff0000ff0000ffff +# CHECK-NEXT: 1 2 0.50 movi v0.2s, #8, msl #8 +# CHECK-NEXT: 1 2 0.50 movi v0.4s, #255, lsl #24 +# CHECK-NEXT: 1 2 0.50 movi v0.8b, #255 +# CHECK-NEXT: 1 3 0.50 mul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 mvni v0.2s, #0 +# CHECK-NEXT: 1 2 0.50 mvni v0.4s, #16, msl #16 +# CHECK-NEXT: 1 3 0.50 neg d29, d24 +# CHECK-NEXT: 1 3 0.50 neg v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 neg v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 neg v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 neg v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 neg v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 neg v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 neg v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 mvn v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 mvn v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 orn v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 mov v0.16b, v0.16b +# 
CHECK-NEXT: 1 2 0.50 orr v0.8h, #31 +# CHECK-NEXT: 1 2 0.50 pmul v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 pmul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 pmull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 pmull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 2 6 1.00 raddhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 raddhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 2 6 1.00 raddhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 raddhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 raddhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 raddhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 rbit v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 rbit v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 rev16 v21.8b, v1.8b +# CHECK-NEXT: 1 2 0.50 rev16 v30.16b, v31.16b +# CHECK-NEXT: 1 2 0.50 rev32 v0.4h, v9.4h +# CHECK-NEXT: 1 2 0.50 rev32 v21.8b, v1.8b +# CHECK-NEXT: 1 2 0.50 rev32 v30.16b, v31.16b +# CHECK-NEXT: 1 2 0.50 rev32 v4.8h, v7.8h +# CHECK-NEXT: 1 2 0.50 rev64 v0.16b, v31.16b +# CHECK-NEXT: 1 2 0.50 rev64 v1.8b, v9.8b +# CHECK-NEXT: 1 2 0.50 rev64 v13.4h, v21.4h +# CHECK-NEXT: 1 2 0.50 rev64 v2.8h, v4.8h +# CHECK-NEXT: 1 2 0.50 rev64 v4.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 rev64 v6.4s, v8.4s +# CHECK-NEXT: 2 6 1.00 rshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 rshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 2 6 1.00 rshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 rshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 rshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 rshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 2 6 1.00 rsubhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 rsubhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 2 6 1.00 rsubhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 rsubhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 2 6 1.00 rsubhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 2 6 1.00 rsubhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 saba v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 sabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 sabal2 
v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 sabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 sabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sabd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sabdl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 sabdl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sabdl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 sabdl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 sabdl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 sabdl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sadalp v0.1d, v0.2s +# CHECK-NEXT: 1 2 0.50 sadalp v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 sadalp v0.2s, v0.4h +# CHECK-NEXT: 1 2 0.50 sadalp v0.4h, v0.8b +# CHECK-NEXT: 1 2 0.50 sadalp v0.4s, v0.8h +# CHECK-NEXT: 1 2 0.50 sadalp v0.8h, v0.16b +# CHECK-NEXT: 1 2 0.50 saddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 saddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 saddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 saddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 saddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 saddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 saddlp v0.1d, v0.2s +# CHECK-NEXT: 1 2 0.50 saddlp v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 saddlp v0.2s, v0.4h +# CHECK-NEXT: 1 2 0.50 saddlp v0.4h, v0.8b +# CHECK-NEXT: 1 2 0.50 saddlp v0.4s, v0.8h +# CHECK-NEXT: 1 2 0.50 saddlp v0.8h, v0.16b +# CHECK-NEXT: 1 2 0.50 saddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 2 0.50 saddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 2 0.50 saddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 2 0.50 saddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 saddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 2 0.50 saddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 scvtf d21, d12 +# CHECK-NEXT: 1 3 0.50 scvtf d21, d12, #64 +# CHECK-NEXT: 1 3 0.50 scvtf s22, s13 +# CHECK-NEXT: 1 3 0.50 scvtf s22, s13, #32 +# CHECK-NEXT: 1 3 0.50 scvtf v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 scvtf v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 scvtf v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 scvtf v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 scvtf v0.4h, 
v0.4h +# CHECK-NEXT: 1 3 0.50 scvtf v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 scvtf v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 scvtf v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 shadd v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 shl d7, d10, #12 +# CHECK-NEXT: 1 3 0.50 shl v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 shl v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 shl v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 shl v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 shll v0.2d, v0.2s, #32 +# CHECK-NEXT: 1 3 0.50 shll v0.4s, v0.4h, #16 +# CHECK-NEXT: 1 3 0.50 shll v0.8h, v0.8b, #8 +# CHECK-NEXT: 1 3 0.50 shll v0.2d, v0.2s, #32 +# CHECK-NEXT: 1 3 0.50 shll v0.4s, v0.4h, #16 +# CHECK-NEXT: 1 3 0.50 shll v0.8h, v0.8b, #8 +# CHECK-NEXT: 1 3 0.50 shll2 v0.2d, v0.4s, #32 +# CHECK-NEXT: 1 3 0.50 shll2 v0.4s, v0.8h, #16 +# CHECK-NEXT: 1 3 0.50 shll2 v0.8h, v0.16b, #8 +# CHECK-NEXT: 1 3 0.50 shll2 v0.2d, v0.4s, #32 +# CHECK-NEXT: 1 3 0.50 shll2 v0.4s, v0.8h, #16 +# CHECK-NEXT: 1 3 0.50 shll2 v0.8h, v0.16b, #8 +# CHECK-NEXT: 2 6 1.00 shrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 shrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 2 6 1.00 shrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 shrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 shrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 shrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 shsub v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 shsub v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sli d10, d14, #12 +# CHECK-NEXT: 1 3 0.50 sli v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 smax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 smax v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 smax v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 smaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 smaxp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 smaxp 
v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 smin v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 smin v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 smin v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 sminp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sminp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 sminp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 smlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 smlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 smlal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 smlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 smlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 smlal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 smlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 smlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 smlsl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 smlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 smlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 smlsl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 smull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 smull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 smull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 smull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 smull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 smull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sqabs b19, b14 +# CHECK-NEXT: 1 2 0.50 sqabs d18, d12 +# CHECK-NEXT: 1 2 0.50 sqabs h21, h15 +# CHECK-NEXT: 1 2 0.50 sqabs s20, s12 +# CHECK-NEXT: 1 2 0.50 sqabs v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sqabs v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 sqabs v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 sqabs v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sqabs v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 sqabs v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 sqabs v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 sqadd b20, b11, b15 +# CHECK-NEXT: 1 2 0.50 sqadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sqadd v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 sqdmlal d19, s24, s12 +# CHECK-NEXT: 1 3 0.50 sqdmlal d8, s9, v0.s[1] +# CHECK-NEXT: 1 3 0.50 sqdmlal s0, h0, v0.h[3] +# 
CHECK-NEXT: 1 3 0.50 sqdmlal s17, h27, h12 +# CHECK-NEXT: 1 3 0.50 sqdmlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 sqdmlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sqdmlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 sqdmlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sqdmlsl d12, s23, s13 +# CHECK-NEXT: 1 3 0.50 sqdmlsl d8, s9, v0.s[1] +# CHECK-NEXT: 1 3 0.50 sqdmlsl s0, h0, v0.h[3] +# CHECK-NEXT: 1 3 0.50 sqdmlsl s14, h12, h25 +# CHECK-NEXT: 1 3 0.50 sqdmlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 sqdmlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sqdmlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 sqdmlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sqdmulh h10, h11, h12 +# CHECK-NEXT: 1 3 0.50 sqdmulh h7, h15, v0.h[3] +# CHECK-NEXT: 1 3 0.50 sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 3 0.50 sqdmulh s20, s21, s2 +# CHECK-NEXT: 1 3 0.50 sqdmulh v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 sqdmulh v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 sqdmull d1, s1, v0.s[1] +# CHECK-NEXT: 1 3 0.50 sqdmull d15, s22, s12 +# CHECK-NEXT: 1 3 0.50 sqdmull s1, h1, v0.h[3] +# CHECK-NEXT: 1 3 0.50 sqdmull s12, h22, h12 +# CHECK-NEXT: 1 3 0.50 sqdmull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 sqdmull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sqdmull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 sqdmull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 sqneg b19, b14 +# CHECK-NEXT: 1 2 0.50 sqneg d18, d12 +# CHECK-NEXT: 1 2 0.50 sqneg h21, h15 +# CHECK-NEXT: 1 2 0.50 sqneg s20, s12 +# CHECK-NEXT: 1 2 0.50 sqneg v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 sqneg v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 sqneg v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 sqneg v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sqneg v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 sqneg v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 sqneg v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sqrdmulh h10, h11, h12 +# CHECK-NEXT: 1 3 0.50 sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: 1 3 0.50 sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 3 0.50 sqrdmulh s20, s21, s2 +# 
CHECK-NEXT: 1 3 0.50 sqrdmulh v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sqrdmulh v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 sqrshl d31, d31, d31 +# CHECK-NEXT: 1 2 0.50 sqrshl h3, h4, h15 +# CHECK-NEXT: 1 2 0.50 sqrshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 sqrshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sqrshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 sqrshrn b10, h13, #2 +# CHECK-NEXT: 1 3 0.50 sqrshrn h15, s10, #6 +# CHECK-NEXT: 1 3 0.50 sqrshrn s15, d12, #9 +# CHECK-NEXT: 1 2 0.50 sqrshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 sqrshrun b17, h10, #6 +# CHECK-NEXT: 1 3 0.50 sqrshrun h10, s13, #15 +# CHECK-NEXT: 1 3 0.50 sqrshrun s22, d16, #31 +# CHECK-NEXT: 1 2 0.50 sqrshrun v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrun v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrun v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrun2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrun2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 sqrshrun2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 sqshl b11, b19, #7 +# CHECK-NEXT: 1 2 0.50 sqshl d15, d16, #51 +# CHECK-NEXT: 1 2 0.50 sqshl d31, d31, d31 +# CHECK-NEXT: 1 2 0.50 sqshl h13, h18, #11 +# CHECK-NEXT: 1 2 0.50 sqshl h3, h4, h15 +# CHECK-NEXT: 1 2 0.50 sqshl s14, s17, #22 +# CHECK-NEXT: 1 2 0.50 sqshl v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 sqshl v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 sqshl v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 sqshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 sqshl v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 sqshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sqshl v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 sqshl v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 sqshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 sqshl v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 sqshlu b15, 
b18, #6 +# CHECK-NEXT: 1 2 0.50 sqshlu d11, d13, #32 +# CHECK-NEXT: 1 2 0.50 sqshlu h19, h17, #6 +# CHECK-NEXT: 1 2 0.50 sqshlu s16, s14, #25 +# CHECK-NEXT: 1 2 0.50 sqshlu v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 sqshlu v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 sqshlu v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 sqshlu v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 sqshlu v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 sqshlu v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 sqshlu v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 sqshrn b10, h15, #5 +# CHECK-NEXT: 1 3 0.50 sqshrn h17, s10, #4 +# CHECK-NEXT: 1 3 0.50 sqshrn s18, d10, #31 +# CHECK-NEXT: 2 6 1.00 sqshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 sqshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 2 6 1.00 sqshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 sqshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 sqshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 sqshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 sqshrun b15, h10, #7 +# CHECK-NEXT: 1 3 0.50 sqshrun h20, s14, #3 +# CHECK-NEXT: 1 3 0.50 sqshrun s10, d15, #15 +# CHECK-NEXT: 2 6 1.00 sqshrun v0.2s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 sqshrun v0.4h, v0.4s, #3 +# CHECK-NEXT: 2 6 1.00 sqshrun v0.8b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 sqshrun2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 2 6 1.00 sqshrun2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 2 6 1.00 sqshrun2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 sqsub s20, s10, s7 +# CHECK-NEXT: 1 2 0.50 sqsub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 sqsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 sqsub v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 sqxtn b18, h18 +# CHECK-NEXT: 1 2 0.50 sqxtn h20, s17 +# CHECK-NEXT: 1 2 0.50 sqxtn s19, d14 +# CHECK-NEXT: 1 2 0.50 sqxtn v0.2s, v0.2d +# CHECK-NEXT: 1 2 0.50 sqxtn v0.4h, v0.4s +# CHECK-NEXT: 1 2 0.50 sqxtn v0.8b, v0.8h +# CHECK-NEXT: 1 2 0.50 sqxtn2 v0.16b, v0.8h +# CHECK-NEXT: 1 2 0.50 sqxtn2 v0.4s, v0.2d +# CHECK-NEXT: 1 2 0.50 sqxtn2 v0.8h, v0.4s +# CHECK-NEXT: 1 2 0.50 sqxtun b19, h14 +# CHECK-NEXT: 1 2 0.50 sqxtun h21, s15 +# 
CHECK-NEXT: 1 2 0.50 sqxtun s20, d12 +# CHECK-NEXT: 1 2 0.50 sqxtun v0.2s, v0.2d +# CHECK-NEXT: 1 2 0.50 sqxtun v0.4h, v0.4s +# CHECK-NEXT: 1 2 0.50 sqxtun v0.8b, v0.8h +# CHECK-NEXT: 1 2 0.50 sqxtun2 v0.16b, v0.8h +# CHECK-NEXT: 1 2 0.50 sqxtun2 v0.4s, v0.2d +# CHECK-NEXT: 1 2 0.50 sqxtun2 v0.8h, v0.4s +# CHECK-NEXT: 1 2 0.50 srhadd v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 srhadd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 srhadd v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 sri d10, d12, #14 +# CHECK-NEXT: 1 3 0.50 sri v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 srshl d16, d16, d16 +# CHECK-NEXT: 1 3 0.50 srshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 srshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 srshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 srshr d19, d18, #7 +# CHECK-NEXT: 1 3 0.50 srshr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 srsra d15, d11, #19 +# CHECK-NEXT: 1 2 0.50 srsra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 srsra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 srsra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 srsra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 srsra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 srsra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 srsra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 sshl d31, d31, d31 +# CHECK-NEXT: 1 2 0.50 sshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 sshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 sshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sshl v0.8b, v0.8b, v0.8b 
+# CHECK-NEXT: 1 3 0.50 sshll v0.2d, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 sshll2 v0.4s, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 sshr d15, d16, #12 +# CHECK-NEXT: 1 3 0.50 sshr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 ssra d18, d12, #21 +# CHECK-NEXT: 1 2 0.50 ssra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 ssra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 ssra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 ssra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 ssra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 ssra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 ssra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 ssubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 ssubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 ssubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 ssubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 ssubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 ssubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 ssubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 2 0.50 ssubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 2 0.50 ssubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 2 0.50 ssubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 ssubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 2 0.50 ssubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 2 2 1.00 * st1 { v0.16b }, [x0] +# CHECK-NEXT: 6 4 3.00 * st1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +# CHECK-NEXT: 8 5 4.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +# CHECK-NEXT: 4 3 2.00 * st1 { v0.4s, v1.4s }, [sp], #32 +# CHECK-NEXT: 6 4 3.00 * st1 { v0.4s, v1.4s, v2.4s }, [sp] +# CHECK-NEXT: 8 5 4.00 * st1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +# CHECK-NEXT: 2 2 1.00 * st1 { v0.8h }, [x15], x2 +# CHECK-NEXT: 4 3 2.00 * st1 { v0.8h, v1.8h }, [x15] +# CHECK-NEXT: 3 4 1.00 * st1 { v0.d }[1], [x0] +# CHECK-NEXT: 3 4 1.00 * st1 { v0.d 
}[1], [x0], #8 +# CHECK-NEXT: 6 5 2.00 * st2 { v0.16b, v1.16b }, [x0], x1 +# CHECK-NEXT: 6 6 2.00 * st2 { v0.8b, v1.8b }, [x0] +# CHECK-NEXT: 6 5 2.00 * st2 { v0.s, v1.s }[3], [sp] +# CHECK-NEXT: 6 5 2.00 * st2 { v0.s, v1.s }[3], [sp], #8 +# CHECK-NEXT: 9 6 3.00 * st3 { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: 9 6 3.00 * st3 { v0.8h, v1.8h, v2.8h }, [x15], x2 +# CHECK-NEXT: 9 6 3.00 * st3 { v0.h, v1.h, v2.h }[7], [x15] +# CHECK-NEXT: 9 6 3.00 * st3 { v0.h, v1.h, v2.h }[7], [x15], #6 +# CHECK-NEXT: 14 9 4.00 * st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: 12 7 4.00 * st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +# CHECK-NEXT: 12 7 4.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0] +# CHECK-NEXT: 12 7 4.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0], x5 +# CHECK-NEXT: 1 2 0.50 sub d15, d5, d16 +# CHECK-NEXT: 1 2 0.50 sub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 suqadd b19, b14 +# CHECK-NEXT: 1 2 0.50 suqadd d18, d22 +# CHECK-NEXT: 1 2 0.50 suqadd h20, h15 +# CHECK-NEXT: 1 2 0.50 suqadd s21, s12 +# CHECK-NEXT: 1 2 0.50 suqadd v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 suqadd v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 suqadd v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 suqadd v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 suqadd v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 suqadd v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 suqadd v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 tbl v0.16b, { v0.16b }, v0.16b +# CHECK-NEXT: 2 4 1.00 tbl v0.16b, { v0.16b, v1.16b }, v0.16b +# CHECK-NEXT: 3 6 1.50 tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +# CHECK-NEXT: 4 8 2.00 tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +# CHECK-NEXT: 1 2 0.50 tbl v0.8b, { v0.16b }, v0.8b +# CHECK-NEXT: 2 4 1.00 tbl v0.8b, { v0.16b, v1.16b }, v0.8b +# CHECK-NEXT: 3 6 1.50 tbl v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +# CHECK-NEXT: 4 8 2.00 tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +# CHECK-NEXT: 1 2 0.50 tbx v0.16b, { v0.16b }, v0.16b +# CHECK-NEXT: 2 4 1.00 tbx v0.16b, { v0.16b, v1.16b }, v0.16b +# CHECK-NEXT: 3 6 1.50 
tbx v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +# CHECK-NEXT: 4 8 2.00 tbx v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +# CHECK-NEXT: 1 2 0.50 tbx v0.8b, { v0.16b }, v0.8b +# CHECK-NEXT: 2 4 1.00 tbx v0.8b, { v0.16b, v1.16b }, v0.8b +# CHECK-NEXT: 3 6 1.50 tbx v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +# CHECK-NEXT: 4 8 2.00 tbx v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +# CHECK-NEXT: 1 2 0.50 trn1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 trn1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 trn1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 trn1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 trn1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 trn1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 trn1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 trn2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 trn2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 trn2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 trn2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 trn2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 trn2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 trn2 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uaba v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 uabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uabd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uabdl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 uabdl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uabdl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uabdl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uabdl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uabdl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uadalp v0.1d, v0.2s +# CHECK-NEXT: 1 2 0.50 uadalp v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 uadalp v0.2s, v0.4h +# CHECK-NEXT: 1 2 0.50 uadalp v0.4h, v0.8b +# CHECK-NEXT: 1 2 0.50 uadalp v0.4s, 
v0.8h +# CHECK-NEXT: 1 2 0.50 uadalp v0.8h, v0.16b +# CHECK-NEXT: 1 2 0.50 uaddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 uaddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uaddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uaddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uaddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uaddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uaddlp v0.1d, v0.2s +# CHECK-NEXT: 1 2 0.50 uaddlp v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 uaddlp v0.2s, v0.4h +# CHECK-NEXT: 1 2 0.50 uaddlp v0.4h, v0.8b +# CHECK-NEXT: 1 2 0.50 uaddlp v0.4s, v0.8h +# CHECK-NEXT: 1 2 0.50 uaddlp v0.8h, v0.16b +# CHECK-NEXT: 1 2 0.50 uaddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 2 0.50 uaddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 2 0.50 uaddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 2 0.50 uaddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 uaddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 2 0.50 uaddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 ucvtf d21, d14 +# CHECK-NEXT: 1 3 0.50 ucvtf d21, d14, #64 +# CHECK-NEXT: 1 3 0.50 ucvtf s22, s13 +# CHECK-NEXT: 1 3 0.50 ucvtf s22, s13, #32 +# CHECK-NEXT: 1 3 0.50 ucvtf v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 ucvtf v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 ucvtf v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 ucvtf v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 ucvtf v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 ucvtf v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 ucvtf v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 ucvtf v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uhsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 umax v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 umax v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 umax v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 umaxp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 umaxp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 umaxp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 umin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 umin v0.4h, v0.4h, 
v0.4h +# CHECK-NEXT: 1 2 0.50 umin v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 uminp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uminp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 umlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 umlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 umlal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 umlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 umlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 umlal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 umlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 umlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 umlsl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 umlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 umlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 umlsl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 umull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 umull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 umull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 umull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 umull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 umull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uqadd h0, h1, h5 +# CHECK-NEXT: 1 2 0.50 uqadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uqrshl b11, b20, b30 +# CHECK-NEXT: 1 2 0.50 uqrshl s23, s20, s16 +# CHECK-NEXT: 1 2 0.50 uqrshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uqrshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uqrshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uqrshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uqrshrn b10, h12, #5 +# CHECK-NEXT: 1 3 0.50 uqrshrn h12, s10, #14 +# CHECK-NEXT: 1 3 0.50 uqrshrn s10, d10, #25 +# CHECK-NEXT: 1 2 0.50 uqrshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 uqrshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 uqrshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 uqrshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 uqrshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 uqrshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 uqshl b11, b20, b30 +# 
CHECK-NEXT: 1 2 0.50 uqshl b18, b15, #6 +# CHECK-NEXT: 1 2 0.50 uqshl d15, d12, #19 +# CHECK-NEXT: 1 2 0.50 uqshl h11, h18, #7 +# CHECK-NEXT: 1 2 0.50 uqshl s14, s19, #18 +# CHECK-NEXT: 1 2 0.50 uqshl s23, s20, s16 +# CHECK-NEXT: 1 2 0.50 uqshl v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 uqshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uqshl v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 uqshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 uqshl v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 uqshl v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 uqshl v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 uqshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uqshl v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 uqshl v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 uqshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uqshrn b12, h10, #7 +# CHECK-NEXT: 1 3 0.50 uqshrn h10, s14, #5 +# CHECK-NEXT: 1 3 0.50 uqshrn s10, d12, #13 +# CHECK-NEXT: 1 2 0.50 uqshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 uqshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 uqshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 uqshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 uqshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 uqshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 uqsub d16, d16, d16 +# CHECK-NEXT: 1 2 0.50 uqsub v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 2 6 1.00 uqxtn b18, h18 +# CHECK-NEXT: 2 6 1.00 uqxtn h20, s17 +# CHECK-NEXT: 2 6 1.00 uqxtn s19, d14 +# CHECK-NEXT: 2 6 1.00 uqxtn v0.2s, v0.2d +# CHECK-NEXT: 2 6 1.00 uqxtn v0.4h, v0.4s +# CHECK-NEXT: 2 6 1.00 uqxtn v0.8b, v0.8h +# CHECK-NEXT: 2 6 1.00 uqxtn2 v0.16b, v0.8h +# CHECK-NEXT: 2 6 1.00 uqxtn2 v0.4s, v0.2d +# CHECK-NEXT: 2 6 1.00 uqxtn2 v0.8h, v0.4s +# CHECK-NEXT: 1 2 0.50 urecpe v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 urecpe v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 urhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 urhadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 urhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 urshl d8, d7, d4 +# CHECK-NEXT: 1 3 0.50 urshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 
1 3 0.50 urshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 urshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 urshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 urshr d20, d23, #31 +# CHECK-NEXT: 1 3 0.50 urshr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 ursqrte v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 ursqrte v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 ursra d18, d10, #13 +# CHECK-NEXT: 1 2 0.50 ursra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 ursra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 ursra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 ursra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 ursra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 ursra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 ursra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 ushl d0, d0, d0 +# CHECK-NEXT: 1 2 0.50 ushl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 ushl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 ushl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 ushll v0.4s, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 ushll2 v0.8h, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 ushr d10, d17, #18 +# CHECK-NEXT: 1 3 0.50 ushr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 usqadd b19, b14 +# CHECK-NEXT: 1 2 0.50 usqadd d18, d22 +# CHECK-NEXT: 1 2 0.50 usqadd h20, h15 +# CHECK-NEXT: 1 2 0.50 usqadd s21, s12 +# CHECK-NEXT: 1 2 0.50 usqadd v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 usqadd v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 usqadd v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 usqadd v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 usqadd 
v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 usqadd v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 usqadd v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 usra d20, d13, #61 +# CHECK-NEXT: 1 2 0.50 usra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 usra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 usra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 usra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 usra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 usra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 usra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 usubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 usubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 usubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 usubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 usubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 usubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 usubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 2 0.50 usubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 2 0.50 usubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 2 0.50 usubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 2 0.50 usubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 2 0.50 usubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 2 0.50 uzp1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uzp1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 uzp1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 uzp1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uzp1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uzp1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uzp1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 uzp2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 uzp2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 uzp2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 uzp2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uzp2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 uzp2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uzp2 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 xtn v0.2s, v0.2d +# CHECK-NEXT: 1 2 0.50 xtn v0.4h, v0.4s +# CHECK-NEXT: 1 2 0.50 xtn v0.8b, v0.8h +# CHECK-NEXT: 1 2 0.50 xtn2 v0.16b, v0.8h +# CHECK-NEXT: 1 2 0.50 xtn2 v0.4s, v0.2d +# CHECK-NEXT: 1 2 
0.50 xtn2 v0.8h, v0.4s +# CHECK-NEXT: 1 2 0.50 zip1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 zip1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 zip1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 zip1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 zip1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 zip1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 zip1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 zip2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 zip2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 zip2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 zip2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 zip2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 zip2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 zip2 v0.8h, v0.8h, v0.8h + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - Ampere1BUnitA +# CHECK-NEXT: [0.1] - Ampere1BUnitA +# CHECK-NEXT: [1.0] - Ampere1BUnitB +# CHECK-NEXT: [1.1] - Ampere1BUnitB +# CHECK-NEXT: [2] - Ampere1BUnitBS +# CHECK-NEXT: [3.0] - Ampere1BUnitL +# CHECK-NEXT: [3.1] - Ampere1BUnitL +# CHECK-NEXT: [4.0] - Ampere1BUnitS +# CHECK-NEXT: [4.1] - Ampere1BUnitS +# CHECK-NEXT: [5] - Ampere1BUnitX +# CHECK-NEXT: [6] - Ampere1BUnitY +# CHECK-NEXT: [7] - Ampere1BUnitZ + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4.0] [4.1] [5] [6] [7] +# CHECK-NEXT: - - - - 11.00 51.00 51.00 29.00 29.00 604.50 584.50 58.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4.0] [4.1] [5] [6] [7] Instructions: +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs d29, d24 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - abs v0.8h, 
v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - add d17, d31, d29 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - add v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - addhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - addhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - addhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - addhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - addhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - addhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - addp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - addp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - and v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - bic v0.4h, #15, lsl #8 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - bic v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - bif v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - bit v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - bsl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cls v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cls v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cls v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cls v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cls v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cls v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - clz v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - clz v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - clz v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - clz v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - clz v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - clz v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmeq d20, d21, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmeq d20, d21, 
d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmeq v0.16b, v0.16b, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmeq v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmge d20, d21, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmge d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmge v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmge v0.8b, v0.8b, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmgt d20, d21, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmgt d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmgt v0.2s, v0.2s, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmgt v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmhi d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmhi v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmhs d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmhs v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmle d20, d21, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmle v0.2d, v0.2d, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmlt d20, d21, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmlt v0.8h, v0.8h, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmtst d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cmtst v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cnt v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - cnt v0.8b, v0.8b +# CHECK-NEXT: - - - - 1.00 - - - - - - - dup v0.16b, w28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - dup v0.2d, x28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - dup v0.2s, w28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - dup v0.4h, w28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - dup v0.4s, w28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - dup v0.8b, w28 +# CHECK-NEXT: - - - - 1.00 - - - - - - - dup v0.8h, w28 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - eor v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ext v0.16b, 
v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ext v0.8b, v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabd d29, d24, d20 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabd s29, s24, s20 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabs v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabs v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabs v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabs v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fabs v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - facge d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - facge s10, s11, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - facge v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - facgt d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - facgt s10, s11, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - facgt v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - faddp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - faddp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmeq d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmeq d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmeq s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmeq s10, s11, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmeq v0.2s, v0.2s, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmeq v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmge d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmge d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmge s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmge s10, s11, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmge v0.2d, v0.2d, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 
fcmge v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmgt d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmgt d20, d21, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmgt s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmgt s10, s11, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmgt v0.4s, v0.4s, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmgt v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmle d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmle s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmle v0.2d, v0.2d, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmlt d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmlt s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcmlt v0.4s, v0.4s, #0.0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtas d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtas s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtas v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtas v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtas v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtas v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtas v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtau d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtau s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtau v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtau v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtau v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtau v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtau v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtl v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtl v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtl2 v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtl2 v0.4s, v0.8h +# CHECK-NEXT: 
- - - - - - - - - 0.50 0.50 - fcvtms d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtms s22, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtms v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtms v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtms v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtms v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtms v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtmu d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtmu s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtmu v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtmu v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtmu v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtmu v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtmu v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtn v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtn2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtns d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtns s22, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtns v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtns v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtns v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtns v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtns v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtnu d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtnu s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtnu v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtnu v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtnu v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtnu v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 
0.50 0.50 - fcvtnu v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtps d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtps s22, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtps v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtps v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtps v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtps v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtps v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtpu d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtpu s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtpu v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtpu v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtpu v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtpu v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtpu v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtxn s22, d13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtxn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtxn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs d21, d12, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs s21, s12, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzs v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu d21, d12, #1 +# CHECK-NEXT: - - - - - - - - - 
0.50 0.50 - fcvtzu d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu s12, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu s21, s12, #1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fcvtzu v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 - - fdiv v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmax v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmax v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnm v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnm v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnm v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnmp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnmp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxnmp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmaxp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmin v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmin v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminnm v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminnm v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 
0.50 0.50 - fminnm v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminnmp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminnmp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminnmp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fminp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmla d0, d1, v0.d[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmla s0, s1, v0.s[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmla v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmls d0, d4, v0.d[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmls s3, s5, v0.s[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmls v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov v0.2d, #-1.25000000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov v0.2s, #13.00000000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmov v0.4s, #1.00000000 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmul d0, d1, v0.d[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmul s0, s1, v0.s[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmul v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmulx d0, d4, v0.d[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmulx d23, d11, d1 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmulx s20, s22, s15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmulx s3, s5, v0.s[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmulx v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmulx v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fmulx v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fneg v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fneg v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fneg v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 
fneg v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fneg v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 - - frecpe d13, d13 +# CHECK-NEXT: - - - - - - - - - 1.00 - - frecpe s19, s14 +# CHECK-NEXT: - - - - - - - - - 1.00 - - frecpe v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 - - frecpe v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 1.00 - - frecpe v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 1.00 - - frecpe v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 1.00 - - frecpe v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frecps v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frecps d22, d30, d21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frecps s21, s16, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frecpx d16, d19 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frecpx s18, s10 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinta v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinta v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinta v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinta v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinta v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinti v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinti v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinti v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinti v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frinti v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintm v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintm v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintm v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintm v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintm v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintn v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintn v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintn v0.4h, 
v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintn v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintn v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintp v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintp v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintp v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintp v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintp v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintx v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintx v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintx v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintx v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintx v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintz v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintz v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintz v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintz v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frintz v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 - - frsqrte d21, d12 +# CHECK-NEXT: - - - - - - - - - 1.00 - - frsqrte s22, s13 +# CHECK-NEXT: - - - - - - - - - 1.00 - - frsqrte v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 - - frsqrte v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 1.00 - - frsqrte v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 1.00 - - frsqrte v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 1.00 - - frsqrte v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frsqrts d8, d22, d18 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frsqrts s21, s5, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - frsqrts v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 - - fsqrt v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 - - fsqrt v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 1.00 - - fsqrt v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 1.00 - - fsqrt v0.4s, v0.4s +# 
CHECK-NEXT: - - - - - - - - - 1.00 - - fsqrt v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - fsub v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ld1 { v0.16b }, [x0] +# CHECK-NEXT: - - - - - 1.50 1.50 - - - - - ld1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +# CHECK-NEXT: - - - - - 2.00 2.00 - - - - - ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +# CHECK-NEXT: - - - - - 1.00 1.00 - - - - - ld1 { v0.4s, v1.4s }, [sp], #32 +# CHECK-NEXT: - - - - - 1.50 1.50 - - - - - ld1 { v0.4s, v1.4s, v2.4s }, [sp] +# CHECK-NEXT: - - - - - 2.00 2.00 - - - - - ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - ld1 { v0.8h }, [x15], x2 +# CHECK-NEXT: - - - - - 1.00 1.00 - - - - - ld1 { v0.8h, v1.8h }, [x15] +# CHECK-NEXT: - - - - - 0.50 0.50 - - 0.50 0.50 - ld1 { v0.b }[9], [x0] +# CHECK-NEXT: - - - - - 0.50 0.50 - - 0.50 0.50 - ld1 { v0.b }[9], [x0], #1 +# CHECK-NEXT: - - - - - 0.50 0.50 - - 0.50 0.50 - ld1r { v0.16b }, [x0] +# CHECK-NEXT: - - - - - 0.50 0.50 - - 0.50 0.50 - ld1r { v0.16b }, [x0], #1 +# CHECK-NEXT: - - - - - 0.50 0.50 - - 0.50 0.50 - ld1r { v0.8h }, [x15] +# CHECK-NEXT: - - - - - 0.50 0.50 - - 0.50 0.50 - ld1r { v0.8h }, [x15], #2 +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.00 1.00 - ld2 { v0.16b, v1.16b }, [x0], x1 +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.50 1.50 - ld2 { v0.8b, v1.8b }, [x0] +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.00 1.00 - ld2 { v0.h, v1.h }[7], [x15] +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.00 1.00 - ld2 { v0.h, v1.h }[7], [x15], #4 +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.00 1.00 - ld2r { v0.2d, v1.2d }, [x0] +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.00 1.00 - ld2r { v0.2d, v1.2d }, [x0], #16 +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.00 1.00 - ld2r { v0.4s, v1.4s }, [sp] +# CHECK-NEXT: - - - - - 1.00 1.00 - - 1.00 1.00 - ld2r { v0.4s, v1.4s }, [sp], #8 +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3 { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3 { 
v0.8h, v1.8h, v2.8h }, [x15], x2 +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3 { v0.s, v1.s, v2.s }[3], [sp] +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3 { v0.s, v1.s, v2.s }[3], [sp], x3 +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3r { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3r { v0.4h, v1.4h, v2.4h }, [x15], #6 +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3r { v0.8b, v1.8b, v2.8b }, [x0] +# CHECK-NEXT: - - - - - 1.50 1.50 - - 1.50 1.50 - ld3r { v0.8b, v1.8b, v2.8b }, [x0], #3 +# CHECK-NEXT: - - - - - 2.00 2.00 - - 2.00 2.00 - ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: - - - - - 2.00 2.00 - - 2.00 2.00 - ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +# CHECK-NEXT: - - - - - 2.00 2.00 - - 2.00 2.00 - ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0] +# CHECK-NEXT: - - - - - 2.00 2.00 - - 2.00 2.00 - ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0], #32 +# CHECK-NEXT: - - - - - 2.00 2.00 - - 2.00 2.00 - ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0], x0 +# CHECK-NEXT: - - - - - 2.00 2.00 - - - - - ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp] +# CHECK-NEXT: - - - - - 2.00 2.00 - - - - - ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp], x7 +# CHECK-NEXT: - - - - - 2.00 2.00 - - 2.00 2.00 - ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: - - - - - 2.00 2.00 - - 2.00 2.00 - ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x30 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mla v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mls v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov b0, v0.b[15] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov d6, v0.d[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov h2, v0.h[5] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov s17, v0.s[2] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov v2.b[0], v0.b[0] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov v2.h[1], v0.h[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov v2.s[2], v0.s[2] +# CHECK-NEXT: - 
- - - - - - - - 0.50 0.50 - mov v2.d[1], v0.d[1] +# CHECK-NEXT: - - - - 1.00 - - - - 0.50 0.50 - mov v0.b[0], w8 +# CHECK-NEXT: - - - - 1.00 - - - - 0.50 0.50 - mov v0.h[1], w8 +# CHECK-NEXT: - - - - 1.00 - - - - 0.50 0.50 - mov v0.s[2], w8 +# CHECK-NEXT: - - - - 1.00 - - - - 0.50 0.50 - mov v0.d[1], x8 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - movi d15, #0xff00ff00ff00ff +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - movi v0.16b, #31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - movi v0.2d, #0xff0000ff0000ffff +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - movi v0.2s, #8, msl #8 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - movi v0.4s, #255, lsl #24 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - movi v0.8b, #255 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mvni v0.2s, #0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mvni v0.4s, #16, msl #16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg d29, d24 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - neg v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mvn v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mvn v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - orn v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - mov v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - orr v0.8h, #31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - pmul v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - pmul v0.8b, v0.8b, 
v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - pmull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - pmull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - raddhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - raddhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - raddhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - raddhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - raddhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - raddhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rbit v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rbit v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev16 v21.8b, v1.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev16 v30.16b, v31.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev32 v0.4h, v9.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev32 v21.8b, v1.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev32 v30.16b, v31.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev32 v4.8h, v7.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev64 v0.16b, v31.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev64 v1.8b, v9.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev64 v13.4h, v21.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev64 v2.8h, v4.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev64 v4.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - rev64 v6.4s, v8.4s +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rsubhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: - 
- - - - - - - - 1.00 1.00 - rsubhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rsubhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rsubhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rsubhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - rsubhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saba v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabdl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabdl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabdl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabdl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabdl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sabdl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sadalp v0.1d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sadalp v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sadalp v0.2s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sadalp v0.4h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sadalp v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sadalp v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 
- saddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddlp v0.1d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddlp v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddlp v0.2s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddlp v0.4h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddlp v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddlp v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - saddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf d21, d12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf d21, d12, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf s22, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf s22, s13, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - scvtf v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shadd v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shl d7, d10, #12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shl v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - 
- - 0.50 0.50 - shl v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shl v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shl v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll v0.2d, v0.2s, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll v0.4s, v0.4h, #16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll v0.8h, v0.8b, #8 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll v0.2d, v0.2s, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll v0.4s, v0.4h, #16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll v0.8h, v0.8b, #8 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll2 v0.2d, v0.4s, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll2 v0.4s, v0.8h, #16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll2 v0.8h, v0.16b, #8 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll2 v0.2d, v0.4s, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll2 v0.4s, v0.8h, #16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shll2 v0.8h, v0.16b, #8 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - shrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - shrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - shrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - shrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - shrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - shrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shsub v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - shsub v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli d10, d14, #12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli v0.8b, v0.8b, #3 
+# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sli v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smax v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smax v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smaxp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smaxp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smin v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smin v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smin v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sminp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sminp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sminp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlsl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smlsl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smull v0.8h, v0.8b, v0.8b +# 
CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - smull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs b19, b14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs d18, d12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs h21, h15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs s20, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqabs v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqadd b20, b11, b15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqadd v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal d19, s24, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal d8, s9, v0.s[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal s0, h0, v0.h[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal s17, h27, h12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlsl d12, s23, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlsl d8, s9, v0.s[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlsl s0, h0, v0.h[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlsl s14, h12, h25 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 
sqdmlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmulh h10, h11, h12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmulh s20, s21, s2 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmulh v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmulh v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull d1, s1, v0.s[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull d15, s22, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull s1, h1, v0.h[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull s12, h22, h12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqdmull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg b19, b14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg d18, d12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg h21, h15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg s20, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqneg v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrdmulh h10, h11, h12 +# 
CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrdmulh s20, s21, s2 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrdmulh v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrdmulh v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshl d31, d31, d31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshl h3, h4, h15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn b10, h13, #2 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn h15, s10, #6 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn s15, d12, #9 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun b17, h10, #6 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun h10, s13, #15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun s22, d16, #31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqrshrun2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 
- sqshl b11, b19, #7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl d15, d16, #51 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl d31, d31, d31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl h13, h18, #11 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl h3, h4, h15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl s14, s17, #22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshl v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu b15, b18, #6 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu d11, d13, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu h19, h17, #6 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu s16, s14, #25 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshlu v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshrn b10, h15, #5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshrn h17, s10, #4 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshrn s18, d10, #31 +# CHECK-NEXT: - - - 
- - - - - - 1.00 1.00 - sqshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshrun b15, h10, #7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshrun h20, s14, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqshrun s10, d15, #15 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrun v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrun v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrun v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrun2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrun2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - sqshrun2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqsub s20, s10, s7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqsub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqsub v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn b18, h18 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn h20, s17 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn s19, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn v0.8b, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn2 v0.16b, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtn2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun b19, h14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun h21, s15 +# 
CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun s20, d12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun v0.8b, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun2 v0.16b, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sqxtun2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srhadd v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srhadd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srhadd v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri d10, d12, #14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sri v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshl d16, d16, d16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr d19, d18, #7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srshr 
v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra d15, d11, #19 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - srsra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshl d31, d31, d31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshll v0.2d, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshll2 v0.4s, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr d15, d16, #12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sshr v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssra d18, d12, #21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - 
- - - - - 0.50 0.50 - ssra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ssubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 st1 { v0.16b }, [x0] +# CHECK-NEXT: - - - - - - - 1.50 1.50 - - 3.00 st1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +# CHECK-NEXT: - - - - - - - 2.00 2.00 - - 4.00 st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 st1 { v0.4s, v1.4s }, [sp], #32 +# CHECK-NEXT: - - - - - - - 1.50 1.50 - - 3.00 st1 { v0.4s, v1.4s, v2.4s }, [sp] +# CHECK-NEXT: - - - - - - - 2.00 2.00 - - 4.00 st1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +# CHECK-NEXT: - - - - - - - 0.50 0.50 - - 1.00 st1 { v0.8h }, [x15], x2 +# CHECK-NEXT: - - - - - - - 1.00 1.00 - - 2.00 st1 { v0.8h, v1.8h }, [x15] +# CHECK-NEXT: - - - - - - - 0.50 0.50 0.50 0.50 1.00 st1 { v0.d }[1], [x0] +# CHECK-NEXT: - - - - - - - 0.50 0.50 0.50 0.50 1.00 st1 { v0.d }[1], [x0], #8 +# CHECK-NEXT: - - - - - - - 1.00 1.00 1.00 1.00 2.00 st2 { v0.16b, v1.16b }, [x0], x1 +# CHECK-NEXT: - - - - - - - 1.00 1.00 1.00 1.00 2.00 st2 { v0.8b, v1.8b }, [x0] +# CHECK-NEXT: - - - - - - - 1.00 1.00 1.00 1.00 2.00 
st2 { v0.s, v1.s }[3], [sp] +# CHECK-NEXT: - - - - - - - 1.00 1.00 1.00 1.00 2.00 st2 { v0.s, v1.s }[3], [sp], #8 +# CHECK-NEXT: - - - - - - - 1.50 1.50 1.50 1.50 3.00 st3 { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: - - - - - - - 1.50 1.50 1.50 1.50 3.00 st3 { v0.8h, v1.8h, v2.8h }, [x15], x2 +# CHECK-NEXT: - - - - - - - 1.50 1.50 1.50 1.50 3.00 st3 { v0.h, v1.h, v2.h }[7], [x15] +# CHECK-NEXT: - - - - - - - 1.50 1.50 1.50 1.50 3.00 st3 { v0.h, v1.h, v2.h }[7], [x15], #6 +# CHECK-NEXT: - - - - - - - 2.00 2.00 3.00 3.00 4.00 st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: - - - - - - - 2.00 2.00 2.00 2.00 4.00 st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +# CHECK-NEXT: - - - - - - - 2.00 2.00 2.00 2.00 4.00 st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0] +# CHECK-NEXT: - - - - - - - 2.00 2.00 2.00 2.00 4.00 st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0], x5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sub d15, d5, d16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - sub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd b19, b14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd d18, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd h20, h15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd s21, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - suqadd v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - tbl v0.16b, { v0.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - tbl v0.16b, { v0.16b, v1.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 1.50 1.50 - tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 2.00 2.00 - tbl 
v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - tbl v0.8b, { v0.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - tbl v0.8b, { v0.16b, v1.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 1.50 1.50 - tbl v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 2.00 2.00 - tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - tbx v0.16b, { v0.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - tbx v0.16b, { v0.16b, v1.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 1.50 1.50 - tbx v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 2.00 2.00 - tbx v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - tbx v0.8b, { v0.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - tbx v0.8b, { v0.16b, v1.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 1.50 1.50 - tbx v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 2.00 2.00 - tbx v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn2 v0.8b, 
v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - trn2 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaba v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabdl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabdl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabdl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabdl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabdl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uabdl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uadalp v0.1d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uadalp v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uadalp v0.2s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uadalp v0.4h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uadalp v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uadalp v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddlp v0.1d, v0.2s +# CHECK-NEXT: - - - - - - 
- - - 0.50 0.50 - uaddlp v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddlp v0.2s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddlp v0.4h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddlp v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddlp v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uaddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf d21, d14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf d21, d14, #64 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf s22, s13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf s22, s13, #32 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ucvtf v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uhsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umax v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umax v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umax v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umaxp v0.16b, v0.16b, 
v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umaxp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umaxp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umin v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umin v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uminp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uminp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlsl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umlsl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - umull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqadd h0, h1, h5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqadd v0.8h, v0.8h, v0.8h 
+# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshl b11, b20, b30 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshl s23, s20, s16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn b10, h12, #5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn h12, s10, #14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn s10, d10, #25 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqrshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl b11, b20, b30 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl b18, b15, #6 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl d15, d12, #19 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl h11, h18, #7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl s14, s19, #18 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl s23, s20, s16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.4s, v0.4s, v0.4s +# 
CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn b12, h10, #7 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn h10, s14, #5 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn s10, d12, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqsub d16, d16, d16 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uqsub v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn b18, h18 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn h20, s17 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn s19, d14 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn v0.8b, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn2 v0.16b, v0.8h +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - uqxtn2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urecpe v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urecpe v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urhadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshl d8, d7, d4 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshl v0.16b, v0.16b, 
v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr d20, d23, #31 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - urshr v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursqrte v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursqrte v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra d18, d10, #13 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ursra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushl d0, d0, d0 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushll v0.4s, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushll2 v0.8h, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushr d10, d17, #18 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushr v0.16b, v0.16b, #3 +# CHECK-NEXT: - 
- - - - - - - - 0.50 0.50 - ushr v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushr v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushr v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushr v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushr v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - ushr v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd b19, b14 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd d18, d22 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd h20, h15 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd s21, s12 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usqadd v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra d20, d13, #61 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - 
- - - - - - 0.50 0.50 - usubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - usubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - uzp2 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - xtn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - xtn v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - xtn v0.8b, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - xtn2 v0.16b, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - xtn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - xtn2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip1 v0.2d, v0.2d, v0.2d +# 
CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - zip2 v0.8h, v0.8h, v0.8h diff --git a/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/shifted-register.s b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/shifted-register.s new file mode 100644 index 00000000000000..27e0279a701013 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Ampere/Ampere1B/shifted-register.s @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -march=aarch64 -mcpu=ampere1b -resource-pressure=false < %s | FileCheck %s + + add w0, w1, w2, lsl #0 + sub x3, x4, x5, lsl #1 + adds x6, x7, x8, lsr #2 + subs x9, x10, x11, asr #3 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 156 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 12 +# CHECK-NEXT: uOps Per Cycle: 3.85 +# CHECK-NEXT: IPC: 2.56 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.25 add w0, w1, w2 +# 
CHECK-NEXT: 1 1 0.25 sub x3, x4, x5, lsl #1 +# CHECK-NEXT: 2 2 0.50 adds x6, x7, x8, lsr #2 +# CHECK-NEXT: 2 2 0.50 subs x9, x10, x11, asr #3 diff --git a/llvm/test/tools/llvm-objcopy/ELF/binary-output-target.test b/llvm/test/tools/llvm-objcopy/ELF/binary-output-target.test index fc5856691f8dca..f88b7575002a94 100644 --- a/llvm/test/tools/llvm-objcopy/ELF/binary-output-target.test +++ b/llvm/test/tools/llvm-objcopy/ELF/binary-output-target.test @@ -48,6 +48,9 @@ # RUN: llvm-objcopy -I binary -O elf64-loongarch %t.txt %t.la64.o # RUN: llvm-readobj --file-headers %t.la64.o | FileCheck %s --check-prefixes=CHECK,LE,LA64,64 +# RUN: llvm-objcopy -I binary -O elf64-s390 %t.txt %t.s390x.o +# RUN: llvm-readobj --file-headers %t.s390x.o | FileCheck %s --check-prefixes=CHECK,BE,S390X,64 + # CHECK: Format: # 32-SAME: elf32- # 64-SAME: elf64- @@ -64,6 +67,7 @@ # PPCLE-SAME: powerpcle{{$}} # SPARC-SAME: sparc # SPARCEL-SAME: sparc +# S390X-SAME: s390 # X86-64-SAME: x86-64 # AARCH64-NEXT: Arch: aarch64 @@ -81,6 +85,7 @@ # RISCV64-NEXT: Arch: riscv64 # SPARC-NEXT: Arch: sparc{{$}} # SPARCEL-NEXT: Arch: sparcel +# S390X-NEXT: Arch: s390x # X86-64-NEXT: Arch: x86_64 # 32-NEXT: AddressSize: 32bit @@ -116,6 +121,7 @@ # RISCV64-NEXT: Machine: EM_RISCV (0xF3) # SPARC-NEXT: Machine: EM_SPARC (0x2) # SPARCEL-NEXT: Machine: EM_SPARC (0x2) +# S390X-NEXT: Machine: EM_S390 (0x16) # X86-64-NEXT: Machine: EM_X86_64 (0x3E) # CHECK-NEXT: Version: 1 diff --git a/llvm/test/tools/llvm-objcopy/ELF/cross-arch-headers.test b/llvm/test/tools/llvm-objcopy/ELF/cross-arch-headers.test index 882940c05e19c2..9a8128611792d5 100644 --- a/llvm/test/tools/llvm-objcopy/ELF/cross-arch-headers.test +++ b/llvm/test/tools/llvm-objcopy/ELF/cross-arch-headers.test @@ -117,6 +117,10 @@ # RUN: llvm-readobj --file-headers %t.elf64_loongarch.o | FileCheck %s --check-prefixes=CHECK,LE,LA64,64,SYSV # RUN: llvm-readobj --file-headers %t.elf64_loongarch.dwo | FileCheck %s --check-prefixes=CHECK,LE,LA64,64,SYSV +# RUN: 
llvm-objcopy %t.o -O elf64-s390 %t.elf64_s390.o --split-dwo=%t.elf64_s390.dwo +# RUN: llvm-readobj --file-headers %t.elf64_s390.o | FileCheck %s --check-prefixes=CHECK,BE,S390X,64,SYSV +# RUN: llvm-readobj --file-headers %t.elf64_s390.dwo | FileCheck %s --check-prefixes=CHECK,BE,S390X,64,SYSV + !ELF FileHeader: Class: ELFCLASS32 @@ -160,6 +164,7 @@ Symbols: # RISCV32-SAME: riscv{{$}} # RISCV64-SAME: riscv{{$}} # SPARC-SAME: sparc +# S390X-SAME: s390 # X86-64-SAME: x86-64 # DEFAULT-SAME: unknown @@ -182,6 +187,7 @@ Symbols: # RISCV64-NEXT: Arch: riscv64 # SPARC-NEXT: Arch: sparc{{$}} # SPARCEL-NEXT: Arch: sparcel +# S390X-NEXT: Arch: s390x # X86-64-NEXT: Arch: x86_64 # DEFAULT-NEXT: Arch: unknown @@ -210,6 +216,7 @@ Symbols: # RISCV32: Machine: EM_RISCV (0xF3) # RISCV64: Machine: EM_RISCV (0xF3) # SPARC: Machine: EM_SPARC (0x2) +# S390X: Machine: EM_S390 (0x16) # X86-64: Machine: EM_X86_64 (0x3E) # 32: HeaderSize: 52 diff --git a/llvm/test/tools/llvm-objdump/openbsd-headers.test b/llvm/test/tools/llvm-objdump/openbsd-headers.test index f547854feeeedf..84fa59bdf89f5c 100644 --- a/llvm/test/tools/llvm-objdump/openbsd-headers.test +++ b/llvm/test/tools/llvm-objdump/openbsd-headers.test @@ -11,6 +11,8 @@ # CHECK-NEXT: filesz 0x0000000000000000 memsz 0x0000000000000000 flags --- # CHECK-NEXT: OPENBSD_NOBTCFI off 0x0000000000000000 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**0 # CHECK-NEXT: filesz 0x0000000000000000 memsz 0x0000000000000000 flags --- +# CHECK-NEXT: OPENBSD_SYSCALLS off 0x0000000000000000 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**0 +# CHECK-NEXT: filesz 0x0000000000000000 memsz 0x0000000000000000 flags --- # CHECK-NEXT: OPENBSD_BOOTDATA off 0x0000000000000000 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**0 # CHECK-NEXT: filesz 0x0000000000000000 memsz 0x0000000000000000 flags --- @@ -25,4 +27,5 @@ ProgramHeaders: - Type: 0x65a3dbe6 ## PT_OPENBSD_RANDOMIZE - Type: 0x65a3dbe7 ## PT_OPENBSD_WXNEEDED - Type: 
0x65a3dbe8 ## PT_OPENBSD_NOBTCFI + - Type: 0x65a3dbe9 ## PT_OPENBSD_SYSCALLS - Type: 0x65a41be6 ## PT_OPENBSD_BOOTDATA diff --git a/llvm/test/tools/llvm-readobj/COFF/file-headers.test b/llvm/test/tools/llvm-readobj/COFF/file-headers.test index b83a6cf5b972b3..32f39e196b0001 100644 --- a/llvm/test/tools/llvm-readobj/COFF/file-headers.test +++ b/llvm/test/tools/llvm-readobj/COFF/file-headers.test @@ -323,6 +323,7 @@ symbols: # IMPORTLIB:Format: COFF-import-file-i386 # IMPORTLIB-NEXT:Type: code # IMPORTLIB-NEXT:Name type: noprefix +# IMPORTLIB-NEXT:Export name: func # IMPORTLIB-NEXT:Symbol: __imp__func # IMPORTLIB-NEXT:Symbol: _func # IMPORTLIB-NOT:{{.}} diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test new file mode 100644 index 00000000000000..f4c73de7ca6c9d --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test @@ -0,0 +1,32 @@ +# UNSUPPORTED: zlib +# RUN: yaml2obj %s -o %t +# RUN: llvm-readobj -z -p .a -x .b %t 2>&1 | FileCheck %s -DFILE=%t + +# CHECK: String dump of section '.a': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time +# CHECK-NEXT: [ 0] . +# CHECK-NEXT: [ 8] . +# CHECK-NEXT: [ 10] . +# CHECK-NEXT: [ 18] x.c. +# CHECK-NEXT: [ 1e] . +# CHECK-NEXT: [ 20] . +# CHECK-NEXT: Hex dump of section '.b': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time +# CHECK-NEXT: 0x00000000 01000000 00000000 01000000 00000000 ................ +# CHECK-NEXT: 0x00000010 01000000 00000000 789c6304 00000200 ........x.c..... +# CHECK-NEXT: 0x00000020 02 . 
+ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .a + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 010000000000000001000000000000000100000000000000789c63040000020002 + - Name: .b + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 010000000000000001000000000000000100000000000000789c63040000020002 diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test new file mode 100644 index 00000000000000..ea7a8854eb1a0c --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test @@ -0,0 +1,76 @@ +# REQUIRES: zlib +## Test --decompress/-z. + +# RUN: yaml2obj %s -o %t + +# RUN: llvm-readelf -z -x .strings -x .not_null_terminated %t | FileCheck %s --check-prefix=HEX +# RUN: llvm-readobj --decompress -p .strings -p .not_null_terminated %t | FileCheck %s --check-prefix=STR + +# HEX: Hex dump of section '.strings': +# HEX-NEXT: 0x00000000 68657265 00617265 00736f6d 65007374 here.are.some.st +# HEX-NEXT: 0x00000010 72696e67 7300 rings. +# HEX: Hex dump of section '.not_null_terminated': +# HEX-NEXT: 0x00000000 6e6f006e 756c6c no.null + +# STR: String dump of section '.strings': +# STR-NEXT: [ 0] here +# STR-NEXT: [ 5] are +# STR-NEXT: [ 9] some +# STR-NEXT: [ e] strings +# STR-EMPTY: +# STR-NEXT: String dump of section '.not_null_terminated': +# STR-NEXT: [ 0] no +# STR-NEXT: [ 3] null{{$}} +# STR-NOT: {{.}} + +# RUN: llvm-readobj -x .strings -p .not_null_terminated %t | FileCheck %s --check-prefix=COMPRESSED + +# COMPRESSED: String dump of section '.not_null_terminated': +# COMPRESSED-NEXT: [ 0] no +# COMPRESSED-NEXT: [ 3] null +# COMPRESSED-NEXT: Hex dump of section '.strings': +# COMPRESSED-NEXT: 0x00000000 01000000 00000000 16000000 00000000 ................ 
+# COMPRESSED-NEXT: 0x00000010 00000000 00000000 789ccb48 2d4a6548 ........x..H-JeH +# COMPRESSED-NEXT: 0x00000020 04e2e2fc 5c205152 9499975e cc000058 ....\ QR...^...X +# COMPRESSED-NEXT: 0x00000030 2e079b ... + +# RUN: llvm-readelf -z -p .invalid1 -x .invalid2 -x .invalid3 %t 2>&1 | FileCheck %s -DFILE=%t --check-prefix=INVALID + +# INVALID: String dump of section '.invalid1': +# INVALID-NEXT: warning: '[[FILE]]': corrupted compressed section header +# INVALID-NEXT: [ 0] . +# INVALID-NEXT: Hex dump of section '.invalid2': +# INVALID-NEXT: warning: '[[FILE]]': zlib error: Z_DATA_ERROR +# INVALID-NEXT: 0x00000000 01000000 00000000 16000000 00000000 ................ +# INVALID-NEXT: 0x00000010 00000000 00000000 78 ........x +# INVALID-EMPTY: +# INVALID-NEXT: Hex dump of section '.invalid3': +# INVALID-NEXT: warning: '[[FILE]]': unsupported compression type (3) +# INVALID-NEXT: 0x00000000 03000000 00000000 04000000 00000000 ................ +# INVALID-NEXT: 0x00000010 00000000 00000000 789c6360 ........x.c` + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .strings + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 010000000000000016000000000000000000000000000000789ccb482d4a654804e2e2fc5c2051529499975ecc0000582e079b + - Name: .not_null_terminated + Type: SHT_PROGBITS + Content: 6e6f006e756c6c + - Name: .invalid1 + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 01 + - Name: .invalid2 + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 01000000000000001600000000000000000000000000000078 + - Name: .invalid3 + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 030000000000000004000000000000000000000000000000789c6360 diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test new file mode 100644 index 00000000000000..65da952687f526 --- /dev/null +++ 
b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test @@ -0,0 +1,31 @@ +# UNSUPPORTED: zstd +# RUN: yaml2obj %s -o %t +# RUN: llvm-readobj -z -p .a -x .b %t 2>&1 | FileCheck %s -DFILE=%t + +# CHECK: String dump of section '.a': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZSTD or did not find zstd at build time +# CHECK-NEXT: [ 0] . +# CHECK-NEXT: [ 8] . +# CHECK-NEXT: [ 10] . +# CHECK-NEXT: [ 18] (./. .. +# CHECK-NEXT: [ 21] . +# CHECK-NEXT: Hex dump of section '.b': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZSTD or did not find zstd at build time +# CHECK-NEXT: 0x00000000 02000000 00000000 01000000 00000000 ................ +# CHECK-NEXT: 0x00000010 01000000 00000000 28b52ffd 20010900 ........(./. ... +# CHECK-NEXT: 0x00000020 0001 .. + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .a + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 02000000000000000100000000000000010000000000000028b52ffd200109000001 + - Name: .b + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 02000000000000000100000000000000010000000000000028b52ffd200109000001 diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test new file mode 100644 index 00000000000000..519db879b18c17 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test @@ -0,0 +1,28 @@ +# REQUIRES: zstd +## Test --decompress/-z for zstd. + +# RUN: yaml2obj %s -o %t + +# RUN: llvm-readelf -z -x .strings %t | FileCheck %s --check-prefix=HEX +# RUN: llvm-readobj --decompress -p .strings %t | FileCheck %s --check-prefix=STR + +# HEX: Hex dump of section '.strings': +# HEX-NEXT: 0x00000000 68657265 00617265 00736f6d 65007374 here.are.some.st +# HEX-NEXT: 0x00000010 72696e67 7300 rings. 
+ +# STR: String dump of section '.strings': +# STR-NEXT: [ 0] here +# STR-NEXT: [ 5] are +# STR-NEXT: [ 9] some +# STR-NEXT: [ e] strings + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .strings + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 02000000000000001600000000000000000000000000000028b52ffd2016b10000686572650061726500736f6d6500737472696e677300 diff --git a/llvm/test/tools/llvm-readobj/ELF/program-headers.test b/llvm/test/tools/llvm-readobj/ELF/program-headers.test index 702a06b6403f0a..856cf378ddad95 100644 --- a/llvm/test/tools/llvm-readobj/ELF/program-headers.test +++ b/llvm/test/tools/llvm-readobj/ELF/program-headers.test @@ -29,68 +29,70 @@ # RUN: FileCheck %s --check-prefixes=ELF64,MAPPING --strict-whitespace --match-full-lines # RUN: llvm-readobj -l %t64.elf | FileCheck %s --check-prefixes=ELF-LLVM,ELF64-LLVM -# ELF32:There are 25 program headers, starting at offset 52 +# ELF32:There are 26 program headers, starting at offset 52 # ELF32-EMPTY: # ELF32-NEXT:Program Headers: # ELF32-NEXT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align -# ELF32-NEXT: PHDR 0x000354 0x00001000 0x00001000 0x00003 0x00003 W 0x1 -# ELF32-NEXT: PHDR 0x000357 0x00002000 0x00002000 0x00007 0x00007 E 0x1 -# ELF32-NEXT: NULL 0x000357 0x00002000 0x00002000 0x00007 0x00007 E 0x1 -# ELF32-NEXT: DYNAMIC 0x000354 0x00001000 0x00001000 0x00003 0x00003 RWE 0x1 -# ELF32-NEXT: INTERP 0x00035e 0x00003000 0x00003000 0x00004 0x00004 RW 0x1 +# ELF32-NEXT: PHDR 0x000374 0x00001000 0x00001000 0x00003 0x00003 W 0x1 +# ELF32-NEXT: PHDR 0x000377 0x00002000 0x00002000 0x00007 0x00007 E 0x1 +# ELF32-NEXT: NULL 0x000377 0x00002000 0x00002000 0x00007 0x00007 E 0x1 +# ELF32-NEXT: DYNAMIC 0x000374 0x00001000 0x00001000 0x00003 0x00003 RWE 0x1 +# ELF32-NEXT: INTERP 0x00037e 0x00003000 0x00003000 0x00004 0x00004 RW 0x1 # ELF32-NEXT: [Requesting program interpreter: ABC] -# ELF32-NEXT: NOTE 0x000354 0x00001000 0x00001000 0x00003 0x00003 
0x1 -# ELF32-NEXT: SHLIB 0x000354 0x00001000 0x00001000 0x00001 0x00001 0x1 -# ELF32-NEXT: TLS 0x000362 0x00004000 0x00004000 0x00001 0x00001 0x1 -# ELF32-NEXT: : 0x60000000 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: GNU_EH_FRAME 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: SUNW_UNWIND 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: GNU_STACK 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: GNU_RELRO 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: GNU_PROPERTY 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: OPENBSD_MUTABLE 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: OPENBSD_RANDOMIZE 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: OPENBSD_WXNEEDED 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: OPENBSD_NOBTCFI 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: OPENBSD_BOOTDATA 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: : 0x6fffffff 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: : 0x70000000 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: : 0x70000001 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: : 0x70000002 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: : 0x70000003 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 -# ELF32-NEXT: : 0x7fffffff 0x000354 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: NOTE 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: SHLIB 0x000374 0x00001000 0x00001000 0x00001 0x00001 0x1 +# ELF32-NEXT: TLS 0x000382 0x00004000 0x00004000 0x00001 0x00001 0x1 +# ELF32-NEXT: : 0x60000000 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: GNU_EH_FRAME 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: SUNW_UNWIND 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: GNU_STACK 0x000374 0x00001000 
0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: GNU_RELRO 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: GNU_PROPERTY 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: OPENBSD_MUTABLE 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: OPENBSD_RANDOMIZE 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: OPENBSD_WXNEEDED 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: OPENBSD_NOBTCFI 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: OPENBSD_SYSCALLS 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: OPENBSD_BOOTDATA 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: : 0x6fffffff 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: : 0x70000000 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: : 0x70000001 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: : 0x70000002 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: : 0x70000003 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 +# ELF32-NEXT: : 0x7fffffff 0x000374 0x00001000 0x00001000 0x00003 0x00003 0x1 # ELF32-EMPTY: -# ELF64:There are 25 program headers, starting at offset 64 +# ELF64:There are 26 program headers, starting at offset 64 # ELF64-EMPTY: # ELF64-NEXT:Program Headers: # ELF64-NEXT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align -# ELF64-NEXT: PHDR 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 W 0x1 -# ELF64-NEXT: PHDR 0x0005bb 0x0000000000002000 0x0000000000002000 0x000007 0x000007 E 0x1 -# ELF64-NEXT: NULL 0x0005bb 0x0000000000002000 0x0000000000002000 0x000007 0x000007 E 0x1 -# ELF64-NEXT: DYNAMIC 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 RWE 0x1 -# ELF64-NEXT: INTERP 0x0005c2 0x0000000000003000 0x0000000000003000 0x000004 0x000004 RW 0x1 +# ELF64-NEXT: PHDR 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 W 0x1 +# ELF64-NEXT: PHDR 0x0005f3 
0x0000000000002000 0x0000000000002000 0x000007 0x000007 E 0x1 +# ELF64-NEXT: NULL 0x0005f3 0x0000000000002000 0x0000000000002000 0x000007 0x000007 E 0x1 +# ELF64-NEXT: DYNAMIC 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 RWE 0x1 +# ELF64-NEXT: INTERP 0x0005fa 0x0000000000003000 0x0000000000003000 0x000004 0x000004 RW 0x1 # ELF64-NEXT: [Requesting program interpreter: ABC] -# ELF64-NEXT: NOTE 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: SHLIB 0x0005b8 0x0000000000001000 0x0000000000001000 0x000001 0x000001 0x1 -# ELF64-NEXT: TLS 0x0005c6 0x0000000000004000 0x0000000000004000 0x000001 0x000001 0x1 -# ELF64-NEXT: : 0x60000000 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: GNU_EH_FRAME 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: SUNW_UNWIND 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: GNU_STACK 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: GNU_RELRO 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: GNU_PROPERTY 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: OPENBSD_MUTABLE 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: OPENBSD_RANDOMIZE 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: OPENBSD_WXNEEDED 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: OPENBSD_NOBTCFI 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: OPENBSD_BOOTDATA 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: : 0x6fffffff 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: : 0x70000000 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: : 0x70000001 0x0005b8 0x0000000000001000 
0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: : 0x70000002 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: : 0x70000003 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ELF64-NEXT: : 0x7fffffff 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: NOTE 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: SHLIB 0x0005f0 0x0000000000001000 0x0000000000001000 0x000001 0x000001 0x1 +# ELF64-NEXT: TLS 0x0005fe 0x0000000000004000 0x0000000000004000 0x000001 0x000001 0x1 +# ELF64-NEXT: : 0x60000000 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: GNU_EH_FRAME 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: SUNW_UNWIND 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: GNU_STACK 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: GNU_RELRO 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: GNU_PROPERTY 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: OPENBSD_MUTABLE 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: OPENBSD_RANDOMIZE 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: OPENBSD_WXNEEDED 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: OPENBSD_NOBTCFI 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: OPENBSD_SYSCALLS 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: OPENBSD_BOOTDATA 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: : 0x6fffffff 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: : 0x70000000 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: : 
0x70000001 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: : 0x70000002 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: : 0x70000003 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ELF64-NEXT: : 0x7fffffff 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 # ELF64-EMPTY: # MAPPING: Section to Segment mapping: @@ -120,13 +122,14 @@ # MAPPING-NEXT: 22 .foo.begin .foo.end {{$}} # MAPPING-NEXT: 23 .foo.begin .foo.end {{$}} # MAPPING-NEXT: 24 .foo.begin .foo.end {{$}} +# MAPPING-NEXT: 25 .foo.begin .foo.end {{$}} # MAPPING-NEXT: None .unused .strtab .shstrtab {{$}} # ELF-LLVM: ProgramHeaders [ # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_PHDR (0x6) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -138,8 +141,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_PHDR (0x6) -# ELF32-LLVM-NEXT: Offset: 0x357 -# ELF64-LLVM-NEXT: Offset: 0x5BB +# ELF32-LLVM-NEXT: Offset: 0x377 +# ELF64-LLVM-NEXT: Offset: 0x5F3 # ELF-LLVM-NEXT: VirtualAddress: 0x2000 # ELF-LLVM-NEXT: PhysicalAddress: 0x2000 # ELF-LLVM-NEXT: FileSize: 7 @@ -151,8 +154,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_NULL (0x0) -# ELF32-LLVM-NEXT: Offset: 0x357 -# ELF64-LLVM-NEXT: Offset: 0x5BB +# ELF32-LLVM-NEXT: Offset: 0x377 +# ELF64-LLVM-NEXT: Offset: 0x5F3 # ELF-LLVM-NEXT: VirtualAddress: 0x2000 # ELF-LLVM-NEXT: PhysicalAddress: 0x2000 # ELF-LLVM-NEXT: FileSize: 7 @@ -164,8 +167,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_DYNAMIC (0x2) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # 
ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -179,8 +182,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_INTERP (0x3) -# ELF32-LLVM-NEXT: Offset: 0x35E -# ELF64-LLVM-NEXT: Offset: 0x5C2 +# ELF32-LLVM-NEXT: Offset: 0x37E +# ELF64-LLVM-NEXT: Offset: 0x5FA # ELF-LLVM-NEXT: VirtualAddress: 0x3000 # ELF-LLVM-NEXT: PhysicalAddress: 0x3000 # ELF-LLVM-NEXT: FileSize: 4 @@ -193,8 +196,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_NOTE (0x4) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -205,8 +208,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_SHLIB (0x5) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 1 @@ -217,8 +220,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_TLS (0x7) -# ELF32-LLVM-NEXT: Offset: 0x362 -# ELF64-LLVM-NEXT: Offset: 0x5C6 +# ELF32-LLVM-NEXT: Offset: 0x382 +# ELF64-LLVM-NEXT: Offset: 0x5FE # ELF-LLVM-NEXT: VirtualAddress: 0x4000 # ELF-LLVM-NEXT: PhysicalAddress: 0x4000 # ELF-LLVM-NEXT: FileSize: 1 @@ -229,8 +232,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: Unknown (0x60000000) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -241,8 +244,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: 
PT_GNU_EH_FRAME (0x6474E550) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -253,8 +256,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_SUNW_UNWIND (0x6464E550) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -265,8 +268,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_GNU_STACK (0x6474E551) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -277,8 +280,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_GNU_RELRO (0x6474E552) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -289,8 +292,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_GNU_PROPERTY (0x6474E553) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -301,8 +304,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_OPENBSD_MUTABLE (0x65A3DBE5) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: 
Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -313,8 +316,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_OPENBSD_RANDOMIZE (0x65A3DBE6) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -325,8 +328,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_OPENBSD_WXNEEDED (0x65A3DBE7) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -337,8 +340,20 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_OPENBSD_NOBTCFI (0x65A3DBE8) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 +# ELF-LLVM-NEXT: VirtualAddress: 0x1000 +# ELF-LLVM-NEXT: PhysicalAddress: 0x1000 +# ELF-LLVM-NEXT: FileSize: 3 +# ELF-LLVM-NEXT: MemSize: 3 +# ELF-LLVM-NEXT: Flags [ (0x0) +# ELF-LLVM-NEXT: ] +# ELF-LLVM-NEXT: Alignment: 1 +# ELF-LLVM-NEXT: } +# ELF-LLVM-NEXT: ProgramHeader { +# ELF-LLVM-NEXT: Type: PT_OPENBSD_SYSCALLS (0x65A3DBE9) +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -349,8 +364,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: PT_OPENBSD_BOOTDATA (0x65A41BE6) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # 
ELF-LLVM-NEXT: FileSize: 3 @@ -361,8 +376,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: Unknown (0x6FFFFFFF) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -373,8 +388,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: Unknown (0x70000000) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -385,8 +400,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: Unknown (0x70000001) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -397,8 +412,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: Unknown (0x70000002) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -409,8 +424,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: Unknown (0x70000003) -# ELF32-LLVM-NEXT: Offset: 0x354 -# ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -421,8 +436,8 @@ # ELF-LLVM-NEXT: } # ELF-LLVM-NEXT: ProgramHeader { # ELF-LLVM-NEXT: Type: Unknown (0x7FFFFFFF) -# ELF32-LLVM-NEXT: Offset: 0x354 -# 
ELF64-LLVM-NEXT: Offset: 0x5B8 +# ELF32-LLVM-NEXT: Offset: 0x374 +# ELF64-LLVM-NEXT: Offset: 0x5F0 # ELF-LLVM-NEXT: VirtualAddress: 0x1000 # ELF-LLVM-NEXT: PhysicalAddress: 0x1000 # ELF-LLVM-NEXT: FileSize: 3 @@ -569,37 +584,42 @@ ProgramHeaders: VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 19: the PT_OPENBSD_BOOTDATA segment. +## Case 19: the PT_OPENBSD_SYSCALLS segment. + - Type: 0x65a3dbe9 ## PT_OPENBSD_SYSCALLS + VAddr: 0x1000 + FirstSec: .foo.begin + LastSec: .foo.end +## Case 20: the PT_OPENBSD_BOOTDATA segment. - Type: 0x65a41be6 ## PT_OPENBSD_BOOTDATA VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 20: the PT_HIOS segment. +## Case 21: the PT_HIOS segment. - Type: 0x6fffffff ## PT_HIOS VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 21: the PT_LOPROC/PT_ARM_ARCHEXT/PT_MIPS_REGINFO segment. +## Case 22: the PT_LOPROC/PT_ARM_ARCHEXT/PT_MIPS_REGINFO segment. - Type: 0x70000000 ## PT_LOPROC/PT_ARM_ARCHEXT/PT_MIPS_REGINFO VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 22: the PT_ARM_EXIDX/PT_MIPS_RTPROC segment. +## Case 23: the PT_ARM_EXIDX/PT_MIPS_RTPROC segment. - Type: 0x70000001 ## PT_ARM_EXIDX, PT_MIPS_RTPROC VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 23: the PT_MIPS_OPTIONS segment. +## Case 24: the PT_MIPS_OPTIONS segment. - Type: 0x70000002 ## PT_MIPS_OPTIONS VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 24: the PT_MIPS_ABIFLAGS/PT_RISCV_ATTRIBUTES segment. +## Case 25: the PT_MIPS_ABIFLAGS/PT_RISCV_ATTRIBUTES segment. - Type: 0x70000003 ## PT_MIPS_ABIFLAGS/PT_RISCV_ATTRIBUTES VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 25: the PT_HIPROC segment. +## Case 26: the PT_HIPROC segment. 
- Type: 0x7fffffff ## PT_HIPROC VAddr: 0x1000 FirstSec: .foo.begin @@ -610,9 +630,9 @@ ProgramHeaders: # RUN: llvm-readelf --program-headers %tarm.elf | FileCheck %s --check-prefix=ARM-GNU # RUN: llvm-readobj --program-headers %tarm.elf | FileCheck %s --check-prefix=ARM-LLVM -# ARM-GNU: : 0x70000000 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ARM-GNU-NEXT: EXIDX 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# ARM-GNU-NEXT: : 0x70000002 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ARM-GNU: : 0x70000000 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ARM-GNU-NEXT: EXIDX 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# ARM-GNU-NEXT: : 0x70000002 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 # ARM-LLVM: ProgramHeader { # ARM-LLVM: Type: Unknown (0x70000000) @@ -626,10 +646,10 @@ ProgramHeaders: # RUN: llvm-readelf --program-headers %tmips.elf | FileCheck %s --check-prefix=MIPS-GNU # RUN: llvm-readobj --program-headers %tmips.elf | FileCheck %s --check-prefix=MIPS-LLVM -# MIPS-GNU: REGINFO 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# MIPS-GNU-NEXT: RTPROC 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# MIPS-GNU-NEXT: OPTIONS 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 -# MIPS-GNU-NEXT: ABIFLAGS 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# MIPS-GNU: REGINFO 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# MIPS-GNU-NEXT: RTPROC 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# MIPS-GNU-NEXT: OPTIONS 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# MIPS-GNU-NEXT: ABIFLAGS 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 # MIPS-LLVM: ProgramHeader { # MIPS-LLVM: Type: PT_MIPS_REGINFO (0x70000000) @@ -645,7 +665,7 @@ ProgramHeaders: # RUN: 
llvm-readelf --program-headers %triscv.elf | FileCheck %s --check-prefix=RISCV-GNU # RUN: llvm-readobj --program-headers %triscv.elf | FileCheck %s --check-prefix=RISCV-LLVM -# RISCV-GNU: ATTRIBUTES 0x0005b8 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 +# RISCV-GNU: ATTRIBUTES 0x0005f0 0x0000000000001000 0x0000000000001000 0x000003 0x000003 0x1 # RISCV-LLVM: ProgramHeader { # RISCV-LLVM: Type: PT_RISCV_ATTRIBUTES (0x70000003) diff --git a/llvm/tools/llvm-cov/SourceCoverageView.cpp b/llvm/tools/llvm-cov/SourceCoverageView.cpp index 71edd5fec4280a..5b85d7d86bfb94 100644 --- a/llvm/tools/llvm-cov/SourceCoverageView.cpp +++ b/llvm/tools/llvm-cov/SourceCoverageView.cpp @@ -139,7 +139,7 @@ bool SourceCoverageView::shouldRenderRegionMarkers( bool SourceCoverageView::hasSubViews() const { return !ExpansionSubViews.empty() || !InstantiationSubViews.empty() || - !BranchSubViews.empty(); + !BranchSubViews.empty() || !MCDCSubViews.empty(); } std::unique_ptr diff --git a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp index abc4c49ecae98e..b93d8cb035306b 100644 --- a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp +++ b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp @@ -246,6 +246,9 @@ tr:hover { tr:last-child { border-bottom: none; } +tr:has(> td >a:target) > td.code > pre { + background-color: #ffa; +} )"; const char *EndHeader = ""; @@ -990,15 +993,13 @@ void SourceCoverageViewHTML::renderMCDCView(raw_ostream &OS, MCDCView &MRV, std::string ColNoStr = Twine(DecisionRegion.ColumnStart).str(); std::string TargetName = "L" + LineNoStr; OS << tag("span", - a("#" + TargetName, tag("span", LineNoStr + ":" + ColNoStr), - TargetName), + a("#" + TargetName, tag("span", LineNoStr + ":" + ColNoStr)), "line-number") + ") to ("; LineNoStr = utostr(uint64_t(DecisionRegion.LineEnd)); ColNoStr = utostr(uint64_t(DecisionRegion.ColumnEnd)); OS << tag("span", - a("#" + TargetName, tag("span", LineNoStr + ":" + ColNoStr), - 
TargetName), + a("#" + TargetName, tag("span", LineNoStr + ":" + ColNoStr)), "line-number") + ")\n\n"; diff --git a/llvm/tools/llvm-cov/SourceCoverageViewText.cpp b/llvm/tools/llvm-cov/SourceCoverageViewText.cpp index 73b7ffe16a9637..580da45ecfc0d8 100644 --- a/llvm/tools/llvm-cov/SourceCoverageViewText.cpp +++ b/llvm/tools/llvm-cov/SourceCoverageViewText.cpp @@ -382,7 +382,8 @@ void SourceCoverageViewText::renderMCDCView(raw_ostream &OS, MCDCView &MRV, colored_ostream(OS, raw_ostream::RED, getOptions().Colors && Record.getPercentCovered() < 100.0, /*Bold=*/false, /*BG=*/true) - << format("%0.2f", Record.getPercentCovered()) << "%\n"; + << format("%0.2f", Record.getPercentCovered()) << "%"; + OS << "\n"; renderLinePrefix(OS, ViewDepth); OS << "\n"; } diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index b2a133860197dd..769ed17ac4cbd5 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -1294,10 +1294,11 @@ class MemoryMatcher { }; static StringRef detectStubKind(const Session::MemoryRegionInfo &Stub) { - constexpr uint32_t Armv7MovWTle = 0xe300c000; - constexpr uint32_t Armv7BxR12le = 0xe12fff1c; - constexpr uint32_t Thumbv7MovWTle = 0x0c00f240; - constexpr uint16_t Thumbv7BxR12le = 0x4760; + using namespace support::endian; + auto Armv7MovWTle = byte_swap(0xe300c000); + auto Armv7BxR12le = byte_swap(0xe12fff1c); + auto Thumbv7MovWTle = byte_swap(0x0c00f240); + auto Thumbv7BxR12le = byte_swap(0x4760); MemoryMatcher M(Stub.getContent()); if (M.matchMask(Thumbv7MovWTle)) { diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp b/llvm/tools/llvm-mc/llvm-mc.cpp index cf6aaa1f06981f..2754d49645595d 100644 --- a/llvm/tools/llvm-mc/llvm-mc.cpp +++ b/llvm/tools/llvm-mc/llvm-mc.cpp @@ -547,11 +547,6 @@ int main(int argc, char **argv) { std::unique_ptr MAB( TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions)); auto FOut = std::make_unique(*OS); - // FIXME: Workaround for bug in 
formatted_raw_ostream. Color escape codes - // are (incorrectly) written directly to the unbuffered raw_ostream wrapped - // by the formatted_raw_ostream. - if (Action == AC_CDisassemble) - FOut->SetUnbuffered(); Str.reset( TheTarget->createAsmStreamer(Ctx, std::move(FOut), /*asmverbose*/ true, /*useDwarfDirectory*/ true, IP, diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp index f15307181fad61..f63e5c61e802c8 100644 --- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp +++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp @@ -299,6 +299,8 @@ static const StringMap TargetMap{ // LoongArch {"elf32-loongarch", {ELF::EM_LOONGARCH, false, true}}, {"elf64-loongarch", {ELF::EM_LOONGARCH, true, true}}, + // SystemZ + {"elf64-s390", {ELF::EM_S390, true, false}}, }; static Expected diff --git a/llvm/tools/llvm-objdump/ELFDump.cpp b/llvm/tools/llvm-objdump/ELFDump.cpp index 34861ee92128fd..fda99bd6d33e17 100644 --- a/llvm/tools/llvm-objdump/ELFDump.cpp +++ b/llvm/tools/llvm-objdump/ELFDump.cpp @@ -291,6 +291,9 @@ template void ELFDumper::printProgramHeaders() { case ELF::PT_OPENBSD_RANDOMIZE: outs() << "OPENBSD_RANDOMIZE "; break; + case ELF::PT_OPENBSD_SYSCALLS: + outs() << "OPENBSD_SYSCALLS "; + break; case ELF::PT_OPENBSD_WXNEEDED: outs() << "OPENBSD_WXNEEDED "; break; diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 22b427f57658e1..7ecdd60313d065 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -2032,13 +2032,6 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, formatted_raw_ostream FOS(outs()); - // FIXME: Workaround for bug in formatted_raw_ostream. Color escape codes - // are (incorrectly) written directly to the unbuffered raw_ostream - // wrapped by the formatted_raw_ostream. 
- if (DisassemblyColor == ColorOutput::Enable || - DisassemblyColor == ColorOutput::Auto) - FOS.SetUnbuffered(); - std::unordered_map AllLabels; std::unordered_map> BBAddrMapLabels; if (SymbolizeOperands) { diff --git a/llvm/tools/llvm-readobj/COFFImportDumper.cpp b/llvm/tools/llvm-readobj/COFFImportDumper.cpp index 8aedc310ae3a9f..0ab2a17655653e 100644 --- a/llvm/tools/llvm-readobj/COFFImportDumper.cpp +++ b/llvm/tools/llvm-readobj/COFFImportDumper.cpp @@ -45,8 +45,14 @@ void dumpCOFFImportFile(const COFFImportFile *File, ScopedPrinter &Writer) { case COFF::IMPORT_NAME_UNDECORATE: Writer.printString("Name type", "undecorate"); break; + case COFF::IMPORT_NAME_EXPORTAS: + Writer.printString("Name type", "export as"); + break; } + if (H->getNameType() != COFF::IMPORT_ORDINAL) + Writer.printString("Export name", File->getExportName()); + for (const object::BasicSymbolRef &Sym : File->symbols()) { raw_ostream &OS = Writer.startLine(); OS << "Symbol: "; diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index f369a63add1149..387124ad53e408 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1478,6 +1478,7 @@ static StringRef segmentTypeToString(unsigned Arch, unsigned Type) { LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_RANDOMIZE); LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_WXNEEDED); LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_NOBTCFI); + LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_SYSCALLS); LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_BOOTDATA); default: return ""; diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp index 59060ac217e32f..0d3fea71aafd42 100644 --- a/llvm/tools/llvm-readobj/ObjDumper.cpp +++ b/llvm/tools/llvm-readobj/ObjDumper.cpp @@ -14,6 +14,7 @@ #include "ObjDumper.h" #include "llvm-readobj.h" #include "llvm/Object/Archive.h" +#include "llvm/Object/Decompressor.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Error.h" #include 
"llvm/Support/FormatVariadic.h" @@ -142,8 +143,23 @@ getSectionRefsByNameOrIndex(const object::ObjectFile &Obj, return Ret; } +static void maybeDecompress(const object::ObjectFile &Obj, + StringRef SectionName, StringRef &SectionContent, + SmallString<0> &Out) { + Expected Decompressor = object::Decompressor::create( + SectionName, SectionContent, Obj.isLittleEndian(), Obj.is64Bit()); + if (!Decompressor) + reportWarning(Decompressor.takeError(), Obj.getFileName()); + else if (auto Err = Decompressor->resizeAndDecompress(Out)) + reportWarning(std::move(Err), Obj.getFileName()); + else + SectionContent = Out; +} + void ObjDumper::printSectionsAsString(const object::ObjectFile &Obj, - ArrayRef Sections) { + ArrayRef Sections, + bool Decompress) { + SmallString<0> Out; bool First = true; for (object::SectionRef Section : getSectionRefsByNameOrIndex(Obj, Sections)) { @@ -156,12 +172,16 @@ void ObjDumper::printSectionsAsString(const object::ObjectFile &Obj, StringRef SectionContent = unwrapOrError(Obj.getFileName(), Section.getContents()); + if (Decompress && Section.isCompressed()) + maybeDecompress(Obj, SectionName, SectionContent, Out); printAsStringList(SectionContent); } } void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj, - ArrayRef Sections) { + ArrayRef Sections, + bool Decompress) { + SmallString<0> Out; bool First = true; for (object::SectionRef Section : getSectionRefsByNameOrIndex(Obj, Sections)) { @@ -174,6 +194,8 @@ void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj, StringRef SectionContent = unwrapOrError(Obj.getFileName(), Section.getContents()); + if (Decompress && Section.isCompressed()) + maybeDecompress(Obj, SectionName, SectionContent, Out); const uint8_t *SecContent = SectionContent.bytes_begin(); const uint8_t *SecEnd = SecContent + SectionContent.size(); diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h index 1d679453581bc8..3958dd3a333332 100644 --- 
a/llvm/tools/llvm-readobj/ObjDumper.h +++ b/llvm/tools/llvm-readobj/ObjDumper.h @@ -175,9 +175,9 @@ class ObjDumper { void printAsStringList(StringRef StringContent, size_t StringDataOffset = 0); void printSectionsAsString(const object::ObjectFile &Obj, - ArrayRef Sections); + ArrayRef Sections, bool Decompress); void printSectionsAsHex(const object::ObjectFile &Obj, - ArrayRef Sections); + ArrayRef Sections, bool Decompress); std::function WarningHandler; void reportUniqueWarning(Error Err) const; diff --git a/llvm/tools/llvm-readobj/Opts.td b/llvm/tools/llvm-readobj/Opts.td index e2d93c6ec229e9..018facc278e891 100644 --- a/llvm/tools/llvm-readobj/Opts.td +++ b/llvm/tools/llvm-readobj/Opts.td @@ -20,6 +20,7 @@ def all : FF<"all", "Equivalent to setting: --file-header, --program-headers, -- def arch_specific : FF<"arch-specific", "Display architecture-specific information">; def bb_addr_map : FF<"bb-addr-map", "Display the BB address map section">; def cg_profile : FF<"cg-profile", "Display call graph profile section">; +def decompress : FF<"decompress", "Dump decompressed section content when used with -x or -p">; defm demangle : BB<"demangle", "Demangle symbol names", "Do not demangle symbol names (default)">; def dependent_libraries : FF<"dependent-libraries", "Display the dependent libraries section">; def dyn_relocations : FF<"dyn-relocations", "Display the dynamic relocation entries in the file">; @@ -139,3 +140,4 @@ def : F<"u", "Alias for --unwind">, Alias; def : F<"X", "Alias for --extra-sym-info">, Alias, Group; def : F<"V", "Alias for --version-info">, Alias, Group; def : JoinedOrSeparate<["-"], "x">, Alias, HelpText<"Alias for --hex-dump">, MetaVarName<"">; +def : F<"z", "Alias for --decompress">, Alias; diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp index f9d605d35244bf..979433d69011c3 100644 --- a/llvm/tools/llvm-readobj/llvm-readobj.cpp +++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp @@ -97,6 +97,7 @@ 
static bool ArchSpecificInfo; static bool BBAddrMap; bool ExpandRelocs; static bool CGProfile; +static bool Decompress; bool Demangle; static bool DependentLibraries; static bool DynRelocs; @@ -212,6 +213,7 @@ static void parseOptions(const opt::InputArgList &Args) { opts::ArchSpecificInfo = Args.hasArg(OPT_arch_specific); opts::BBAddrMap = Args.hasArg(OPT_bb_addr_map); opts::CGProfile = Args.hasArg(OPT_cg_profile); + opts::Decompress = Args.hasArg(OPT_decompress); opts::Demangle = Args.hasFlag(OPT_demangle, OPT_no_demangle, false); opts::DependentLibraries = Args.hasArg(OPT_dependent_libraries); opts::DynRelocs = Args.hasArg(OPT_dyn_relocations); @@ -439,9 +441,9 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer, Dumper->printSymbols(opts::Symbols, opts::DynamicSymbols, opts::ExtraSymInfo, SymComp); if (!opts::StringDump.empty()) - Dumper->printSectionsAsString(Obj, opts::StringDump); + Dumper->printSectionsAsString(Obj, opts::StringDump, opts::Decompress); if (!opts::HexDump.empty()) - Dumper->printSectionsAsHex(Obj, opts::HexDump); + Dumper->printSectionsAsHex(Obj, opts::HexDump, opts::Decompress); if (opts::HashTable) Dumper->printHashTable(); if (opts::GnuHashTable) diff --git a/llvm/tools/llvm-shlib/CMakeLists.txt b/llvm/tools/llvm-shlib/CMakeLists.txt index a47a0ec84c625c..b20ac318e768db 100644 --- a/llvm/tools/llvm-shlib/CMakeLists.txt +++ b/llvm/tools/llvm-shlib/CMakeLists.txt @@ -33,7 +33,13 @@ if(LLVM_BUILD_LLVM_DYLIB) if (LLVM_LINK_LLVM_DYLIB) set(INSTALL_WITH_TOOLCHAIN INSTALL_WITH_TOOLCHAIN) endif() - add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB SONAME ${INSTALL_WITH_TOOLCHAIN} ${SOURCES}) + if (WIN32) + add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB SONAME ${INSTALL_WITH_TOOLCHAIN} ${SOURCES}) + else() + add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB OUTPUT_NAME LLVM ${INSTALL_WITH_TOOLCHAIN} ${SOURCES}) + # Add symlink for backwards compatibility with old library name + 
llvm_install_library_symlink(LLVM-${LLVM_VERSION_MAJOR}${LLVM_VERSION_SUFFIX} $ SHARED FULL_DEST COMPONENT LLVM) + endif() list(REMOVE_DUPLICATES LIB_NAMES) if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin") diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp index d7876b7ce87490..531360a697039c 100644 --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp @@ -1556,12 +1556,12 @@ TEST_F(AArch64GISelMITest, FewerElementsPhi) { CHECK: [[PHI0:%[0-9]+]]:_(<2 x s32>) = G_PHI [[INITVAL_E01]]:_(<2 x s32>), %bb.0, [[MIDVAL_E01]]:_(<2 x s32>), %bb.1 CHECK: [[PHI1:%[0-9]+]]:_(<2 x s32>) = G_PHI [[INITVAL_E23]]:_(<2 x s32>), %bb.0, [[MIDVAL_E23]]:_(<2 x s32>), %bb.1 CHECK: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[INITVAL_E4]]:_(s32), %bb.0, [[MIDVAL_E4]]:_(s32), %bb.1 - CHECK: [[UNMERGE0:%[0-9]+]]:_(s32), [[UNMERGE1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PHI0]]:_(<2 x s32>) - CHECK: [[UNMERGE2:%[0-9]+]]:_(s32), [[UNMERGE3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PHI1]]:_(<2 x s32>) - CHECK: [[BV:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[UNMERGE0]]:_(s32), [[UNMERGE1]]:_(s32), [[UNMERGE2]]:_(s32), [[UNMERGE3]]:_(s32), [[PHI2]]:_(s32) CHECK: [[OTHER_PHI:%[0-9]+]]:_(s64) = G_PHI + CHECK: [[UNMERGE0:%[0-9]+]]:_(s32), [[UNMERGE1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PHI0]]:_(<2 x s32>) + CHECK: [[UNMERGE2:%[0-9]+]]:_(s32), [[UNMERGE3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PHI1]]:_(<2 x s32>) + CHECK: [[BV:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[UNMERGE0]]:_(s32), [[UNMERGE1]]:_(s32), [[UNMERGE2]]:_(s32), [[UNMERGE3]]:_(s32), [[PHI2]]:_(s32) CHECK: [[USE_OP:%[0-9]+]]:_(<5 x s32>) = G_AND [[BV]]:_, [[BV]]:_ )"; diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp index e505af5d3275ef..f60bb4c135bec3 100644 --- a/llvm/unittests/IR/ConstantRangeTest.cpp +++ b/llvm/unittests/IR/ConstantRangeTest.cpp @@ -2479,6 
+2479,24 @@ TEST_F(ConstantRangeTest, castOps) { ConstantRange IntToPtr = A.castOp(Instruction::IntToPtr, 64); EXPECT_EQ(64u, IntToPtr.getBitWidth()); EXPECT_TRUE(IntToPtr.isFullSet()); + + ConstantRange UIToFP = A.castOp(Instruction::UIToFP, 16); + EXPECT_EQ(16u, UIToFP.getBitWidth()); + EXPECT_TRUE(UIToFP.isFullSet()); + + ConstantRange UIToFP2 = A.castOp(Instruction::UIToFP, 64); + ConstantRange B(APInt(64, 0), APInt(64, 65536)); + EXPECT_EQ(64u, UIToFP2.getBitWidth()); + EXPECT_EQ(B, UIToFP2); + + ConstantRange SIToFP = A.castOp(Instruction::SIToFP, 16); + EXPECT_EQ(16u, SIToFP.getBitWidth()); + EXPECT_TRUE(SIToFP.isFullSet()); + + ConstantRange SIToFP2 = A.castOp(Instruction::SIToFP, 64); + ConstantRange C(APInt(64, -32768), APInt(64, 32768)); + EXPECT_EQ(64u, SIToFP2.getBitWidth()); + EXPECT_EQ(C, SIToFP2); } TEST_F(ConstantRangeTest, binaryAnd) { diff --git a/llvm/unittests/ProfileData/CoverageMappingTest.cpp b/llvm/unittests/ProfileData/CoverageMappingTest.cpp index 23f66a0232ddbb..2849781a9dc43b 100644 --- a/llvm/unittests/ProfileData/CoverageMappingTest.cpp +++ b/llvm/unittests/ProfileData/CoverageMappingTest.cpp @@ -890,6 +890,42 @@ TEST_P(CoverageMappingTest, non_code_region_bitmask) { ASSERT_EQ(1U, Names.size()); } +// Test the order of MCDCDecision before Expansion +TEST_P(CoverageMappingTest, decision_before_expansion) { + startFunction("foo", 0x1234); + addCMR(Counter::getCounter(0), "foo", 3, 23, 5, 2); + + // This(4:11) was put after Expansion(4:11) before the fix + addMCDCDecisionCMR(0, 2, "foo", 4, 11, 4, 20); + + addExpansionCMR("foo", "A", 4, 11, 4, 12); + addExpansionCMR("foo", "B", 4, 19, 4, 20); + addCMR(Counter::getCounter(0), "A", 1, 14, 1, 17); + addCMR(Counter::getCounter(0), "A", 1, 14, 1, 17); + addMCDCBranchCMR(Counter::getCounter(0), Counter::getCounter(1), 1, 2, 0, "A", + 1, 14, 1, 17); + addCMR(Counter::getCounter(1), "B", 1, 14, 1, 17); + addMCDCBranchCMR(Counter::getCounter(1), Counter::getCounter(2), 2, 0, 0, "B", + 1, 14, 1, 
17); + + // InputFunctionCoverageData::Regions is rewritten after the write. + auto InputRegions = InputFunctions.back().Regions; + + writeAndReadCoverageRegions(); + + const auto &OutputRegions = OutputFunctions.back().Regions; + + size_t N = ArrayRef(InputRegions).size(); + ASSERT_EQ(N, OutputRegions.size()); + for (size_t I = 0; I < N; ++I) { + ASSERT_EQ(InputRegions[I].Kind, OutputRegions[I].Kind); + ASSERT_EQ(InputRegions[I].FileID, OutputRegions[I].FileID); + ASSERT_EQ(InputRegions[I].ExpandedFileID, OutputRegions[I].ExpandedFileID); + ASSERT_EQ(InputRegions[I].startLoc(), OutputRegions[I].startLoc()); + ASSERT_EQ(InputRegions[I].endLoc(), OutputRegions[I].endLoc()); + } +} + TEST_P(CoverageMappingTest, strip_filename_prefix) { ProfileWriter.addRecord({"file1:func", 0x1234, {0}}, Err); diff --git a/llvm/unittests/Support/RISCVISAInfoTest.cpp b/llvm/unittests/Support/RISCVISAInfoTest.cpp index 9b7112fa2bfeb5..24ed3e2e4b0561 100644 --- a/llvm/unittests/Support/RISCVISAInfoTest.cpp +++ b/llvm/unittests/Support/RISCVISAInfoTest.cpp @@ -366,55 +366,51 @@ TEST(ParseArchString, RejectsDuplicateExtensionNames) { TEST(ParseArchString, RejectsExperimentalExtensionsIfNotEnableExperimentalExtension) { EXPECT_EQ( - toString( - RISCVISAInfo::parseArchString("rv64izicond", false).takeError()), + toString(RISCVISAInfo::parseArchString("rv64iztso", false).takeError()), "requires '-menable-experimental-extensions' for experimental extension " - "'zicond'"); + "'ztso'"); } TEST(ParseArchString, AcceptsExperimentalExtensionsIfEnableExperimentalExtension) { - // Note: If zicond becomes none-experimental, this test will need + // Note: If ztso becomes none-experimental, this test will need // updating (and unfortunately, it will still pass). The failure of // RejectsExperimentalExtensionsIfNotEnableExperimentalExtension will // hopefully serve as a reminder to update. 
- auto MaybeISAInfo = - RISCVISAInfo::parseArchString("rv64izicond", true, false); + auto MaybeISAInfo = RISCVISAInfo::parseArchString("rv64iztso", true, false); ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded()); RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions(); EXPECT_EQ(Exts.size(), 2UL); - EXPECT_EQ(Exts.count("zicond"), 1U); - auto MaybeISAInfo2 = RISCVISAInfo::parseArchString("rv64izicond1p0", true); + EXPECT_EQ(Exts.count("ztso"), 1U); + auto MaybeISAInfo2 = RISCVISAInfo::parseArchString("rv64iztso0p1", true); ASSERT_THAT_EXPECTED(MaybeISAInfo2, Succeeded()); RISCVISAInfo::OrderedExtensionMap Exts2 = (*MaybeISAInfo2)->getExtensions(); EXPECT_EQ(Exts2.size(), 2UL); - EXPECT_EQ(Exts2.count("zicond"), 1U); + EXPECT_EQ(Exts2.count("ztso"), 1U); } TEST(ParseArchString, RequiresExplicitVersionNumberForExperimentalExtensionByDefault) { EXPECT_EQ( - toString( - RISCVISAInfo::parseArchString("rv64izicond", true).takeError()), - "experimental extension requires explicit version number `zicond`"); + toString(RISCVISAInfo::parseArchString("rv64iztso", true).takeError()), + "experimental extension requires explicit version number `ztso`"); } TEST(ParseArchString, AcceptsUnrecognizedVersionIfNotExperimentalExtensionVersionCheck) { auto MaybeISAInfo = - RISCVISAInfo::parseArchString("rv64izicond9p9", true, false); + RISCVISAInfo::parseArchString("rv64iztso9p9", true, false); ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded()); RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions(); EXPECT_EQ(Exts.size(), 2UL); - EXPECT_TRUE(Exts.at("zicond") == (RISCVISAInfo::ExtensionVersion{9, 9})); + EXPECT_TRUE(Exts.at("ztso") == (RISCVISAInfo::ExtensionVersion{9, 9})); } TEST(ParseArchString, RejectsUnrecognizedVersionForExperimentalExtension) { EXPECT_EQ( - toString( - RISCVISAInfo::parseArchString("rv64izicond9p9", true).takeError()), - "unsupported version number 9.9 for experimental extension 'zicond' " - "(this compiler supports 1.0)"); + 
toString(RISCVISAInfo::parseArchString("rv64iztso9p9", true).takeError()), + "unsupported version number 9.9 for experimental extension 'ztso' " + "(this compiler supports 0.1)"); } TEST(ParseArchString, RejectsExtensionVersionForG) { @@ -489,16 +485,16 @@ TEST(ParseArchString, RejectsConflictingExtensions) { TEST(ToFeatures, IIsDroppedAndExperimentalExtensionsArePrefixed) { auto MaybeISAInfo1 = - RISCVISAInfo::parseArchString("rv64im_zicond", true, false); + RISCVISAInfo::parseArchString("rv64im_ztso", true, false); ASSERT_THAT_EXPECTED(MaybeISAInfo1, Succeeded()); EXPECT_THAT((*MaybeISAInfo1)->toFeatures(), - ElementsAre("+m", "+experimental-zicond")); + ElementsAre("+m", "+experimental-ztso")); - auto MaybeISAInfo2 = RISCVISAInfo::parseArchString( - "rv32e_zicond_xventanacondops", true, false); + auto MaybeISAInfo2 = + RISCVISAInfo::parseArchString("rv32e_ztso_xventanacondops", true, false); ASSERT_THAT_EXPECTED(MaybeISAInfo2, Succeeded()); EXPECT_THAT((*MaybeISAInfo2)->toFeatures(), - ElementsAre("+e", "+experimental-zicond", "+xventanacondops")); + ElementsAre("+e", "+experimental-ztso", "+xventanacondops")); } TEST(ToFeatures, UnsupportedExtensionsAreDropped) { @@ -649,10 +645,10 @@ TEST(isSupportedExtensionWithVersion, AcceptsSingleExtensionWithVersion) { TEST(getTargetFeatureForExtension, RetrieveTargetFeatureFromOneExt) { EXPECT_EQ(RISCVISAInfo::getTargetFeatureForExtension("zbb"), "zbb"); - EXPECT_EQ(RISCVISAInfo::getTargetFeatureForExtension("zicond1p0"), - "experimental-zicond"); - EXPECT_EQ(RISCVISAInfo::getTargetFeatureForExtension("zicond"), - "experimental-zicond"); + EXPECT_EQ(RISCVISAInfo::getTargetFeatureForExtension("ztso0p1"), + "experimental-ztso"); + EXPECT_EQ(RISCVISAInfo::getTargetFeatureForExtension("ztso"), + "experimental-ztso"); EXPECT_EQ(RISCVISAInfo::getTargetFeatureForExtension("zihintntl1234p4321"), ""); EXPECT_EQ(RISCVISAInfo::getTargetFeatureForExtension("zfoo"), ""); @@ -684,6 +680,7 @@ R"(All available -march extensions for 
RISC-V zicclsm 1.0 ziccrse 1.0 zicntr 2.0 + zicond 1.0 zicsr 2.0 zifencei 2.0 zihintntl 1.0 @@ -793,7 +790,6 @@ R"(All available -march extensions for RISC-V Experimental extensions zicfilp 0.4 This is a long dummy description zicfiss 0.4 - zicond 1.0 zimop 0.1 zacas 1.0 zfbfmin 1.0 diff --git a/llvm/unittests/TargetParser/Host.cpp b/llvm/unittests/TargetParser/Host.cpp index 5f151616d7ca6a..6aa1d7a087ebf0 100644 --- a/llvm/unittests/TargetParser/Host.cpp +++ b/llvm/unittests/TargetParser/Host.cpp @@ -122,6 +122,9 @@ TEST(getLinuxHostCPUName, AArch64) { EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0xc0\n" "CPU part : 0xac4"), "ampere1a"); + EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0xc0\n" + "CPU part : 0xac5"), + "ampere1b"); // MSM8992/4 weirdness StringRef MSM8992ProcCpuInfo = R"( diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index d31fcd1bb1b00d..131741ff7fd07a 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -133,8 +133,8 @@ template struct AssertSameExtensionFlags { return testing::AssertionFailure() << llvm::formatv( "CPU: {4}\n" - "Expected extension flags: {0} ({1:x})\n" - " Got extension flags: {2} ({3:x})\n", + "Expected extension flags: {0} ({1})\n" + " Got extension flags: {2} ({3})\n", FormatExtensionFlags(ExpectedFlags), SerializeExtensionFlags(ExpectedFlags), FormatExtensionFlags(GotFlags), @@ -1260,7 +1260,8 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_SHA3, AArch64::AEK_SM4, AArch64::AEK_FP16, AArch64::AEK_BF16, AArch64::AEK_PROFILE, AArch64::AEK_RAND, AArch64::AEK_FP16FML, - AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.4-A"), ARMCPUTestParams( "neoverse-v2", "armv9-a", "neon-fp-armv8", @@ -1275,7 +1276,8 @@ INSTANTIATE_TEST_SUITE_P( 
AArch64::AEK_SVE2, AArch64::AEK_PROFILE, AArch64::AEK_FP16FML, AArch64::AEK_I8MM, AArch64::AEK_SVE2BITPERM, AArch64::AEK_RAND, - AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "9-A"), ARMCPUTestParams( "cortex-r82", "armv8-r", "crypto-neon-fp-armv8", @@ -1284,7 +1286,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_DOTPROD, AArch64::AEK_FP, AArch64::AEK_SIMD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_RAS, AArch64::AEK_RCPC, AArch64::AEK_LSE, AArch64::AEK_SB, - AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_JSCVT, AArch64::AEK_FCMA, AArch64::AEK_PAUTH})), "8-R"), ARMCPUTestParams( "cortex-x1", "armv8.2-a", "crypto-neon-fp-armv8", @@ -1389,7 +1391,8 @@ INSTANTIATE_TEST_SUITE_P( {AArch64::AEK_CRC, AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_FP, AArch64::AEK_SIMD, AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, - AArch64::AEK_FP16, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_FP16, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.3-A"), ARMCPUTestParams( "apple-a13", "armv8.4-a", "crypto-neon-fp-armv8", @@ -1399,7 +1402,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_JSCVT, - AArch64::AEK_FCMA})), + AArch64::AEK_FCMA, AArch64::AEK_PAUTH})), "8.4-A"), ARMCPUTestParams( "apple-a14", "armv8.5-a", "crypto-neon-fp-armv8", @@ -1409,7 +1412,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_JSCVT, - AArch64::AEK_FCMA})), + AArch64::AEK_FCMA, AArch64::AEK_PAUTH})), "8.5-A"), ARMCPUTestParams( "apple-a15", "armv8.6-a", "crypto-neon-fp-armv8", @@ -1419,7 +1422,8 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, 
AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_BF16, - AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.6-A"), ARMCPUTestParams( "apple-a16", "armv8.6-a", "crypto-neon-fp-armv8", @@ -1429,7 +1433,8 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_BF16, - AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.6-A"), ARMCPUTestParams( "apple-a17", "armv8.6-a", "crypto-neon-fp-armv8", @@ -1439,7 +1444,8 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_BF16, - AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.6-A"), ARMCPUTestParams( "apple-m1", "armv8.5-a", "crypto-neon-fp-armv8", @@ -1449,7 +1455,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_JSCVT, - AArch64::AEK_FCMA})), + AArch64::AEK_FCMA, AArch64::AEK_PAUTH})), "8.5-A"), ARMCPUTestParams( "apple-m2", "armv8.6-a", "crypto-neon-fp-armv8", @@ -1459,7 +1465,8 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_BF16, - AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.6-A"), 
ARMCPUTestParams( "apple-m3", "armv8.6-a", "crypto-neon-fp-armv8", @@ -1469,7 +1476,8 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_BF16, - AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.6-A"), ARMCPUTestParams( "apple-s4", "armv8.3-a", "crypto-neon-fp-armv8", @@ -1477,7 +1485,8 @@ INSTANTIATE_TEST_SUITE_P( {AArch64::AEK_CRC, AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_FP, AArch64::AEK_SIMD, AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, - AArch64::AEK_FP16, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_FP16, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.3-A"), ARMCPUTestParams( "apple-s5", "armv8.3-a", "crypto-neon-fp-armv8", @@ -1485,7 +1494,8 @@ INSTANTIATE_TEST_SUITE_P( {AArch64::AEK_CRC, AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_FP, AArch64::AEK_SIMD, AArch64::AEK_LSE, AArch64::AEK_RAS, AArch64::AEK_RDM, AArch64::AEK_RCPC, - AArch64::AEK_FP16, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_FP16, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.3-A"), ARMCPUTestParams( "exynos-m3", "armv8-a", "crypto-neon-fp-armv8", @@ -1550,7 +1560,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_SB, AArch64::AEK_SVE2, AArch64::AEK_SVE2BITPERM, AArch64::AEK_BF16, AArch64::AEK_I8MM, AArch64::AEK_JSCVT, - AArch64::AEK_FCMA})), + AArch64::AEK_FCMA, AArch64::AEK_PAUTH})), "8.5-A"), ARMCPUTestParams( "ampere1", "armv8.6-a", "crypto-neon-fp-armv8", @@ -1561,7 +1571,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_SHA3, AArch64::AEK_BF16, AArch64::AEK_SHA2, AArch64::AEK_AES, AArch64::AEK_I8MM, AArch64::AEK_SSBS, AArch64::AEK_SB, AArch64::AEK_RAND, AArch64::AEK_JSCVT, - AArch64::AEK_FCMA})), + AArch64::AEK_FCMA, AArch64::AEK_PAUTH})), "8.6-A"), 
ARMCPUTestParams( "ampere1a", "armv8.6-a", "crypto-neon-fp-armv8", @@ -1572,8 +1582,21 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_SM4, AArch64::AEK_SHA3, AArch64::AEK_BF16, AArch64::AEK_SHA2, AArch64::AEK_AES, AArch64::AEK_I8MM, AArch64::AEK_SSBS, AArch64::AEK_SB, AArch64::AEK_RAND, - AArch64::AEK_MTE, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_MTE, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.6-A"), + ARMCPUTestParams( + "ampere1b", "armv8.7-a", "crypto-neon-fp-armv8", + (AArch64::ExtensionBitset( + {AArch64::AEK_CRC, AArch64::AEK_FP, AArch64::AEK_FP16, + AArch64::AEK_SIMD, AArch64::AEK_RAS, AArch64::AEK_LSE, + AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, + AArch64::AEK_SM4, AArch64::AEK_SHA3, AArch64::AEK_BF16, + AArch64::AEK_SHA2, AArch64::AEK_AES, AArch64::AEK_I8MM, + AArch64::AEK_SSBS, AArch64::AEK_SB, AArch64::AEK_RAND, + AArch64::AEK_MTE, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH, AArch64::AEK_CSSC})), + "8.7-A"), ARMCPUTestParams( "neoverse-512tvb", "armv8.4-a", "crypto-neon-fp-armv8", (AArch64::ExtensionBitset( @@ -1584,7 +1607,8 @@ INSTANTIATE_TEST_SUITE_P( AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_SHA3, AArch64::AEK_SM4, AArch64::AEK_FP16, AArch64::AEK_BF16, AArch64::AEK_PROFILE, AArch64::AEK_RAND, AArch64::AEK_FP16FML, - AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_I8MM, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, + AArch64::AEK_PAUTH})), "8.4-A"), ARMCPUTestParams( "thunderx2t99", "armv8.1-a", "crypto-neon-fp-armv8", @@ -1599,7 +1623,7 @@ INSTANTIATE_TEST_SUITE_P( {AArch64::AEK_CRC, AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_LSE, AArch64::AEK_RDM, AArch64::AEK_FP, AArch64::AEK_SIMD, AArch64::AEK_RAS, AArch64::AEK_RCPC, - AArch64::AEK_JSCVT, AArch64::AEK_FCMA})), + AArch64::AEK_JSCVT, AArch64::AEK_FCMA, AArch64::AEK_PAUTH})), "8.3-A"), ARMCPUTestParams( "thunderx", "armv8-a", "crypto-neon-fp-armv8", @@ -1651,7 +1675,7 @@ INSTANTIATE_TEST_SUITE_P( 
"8.2-A"))); // Note: number of CPUs includes aliases. -static constexpr unsigned NumAArch64CPUArchs = 67; +static constexpr unsigned NumAArch64CPUArchs = 69; TEST(TargetParserTest, testAArch64CPUArchList) { SmallVector List; diff --git a/llvm/utils/TableGen/Attributes.cpp b/llvm/utils/TableGen/Attributes.cpp index 474042a3e9a331..db3c4decccb4cf 100644 --- a/llvm/utils/TableGen/Attributes.cpp +++ b/llvm/utils/TableGen/Attributes.cpp @@ -87,7 +87,11 @@ void Attributes::emitFnAttrCompatCheck(raw_ostream &OS, bool IsStringAttr) { for (auto *Rule : CompatRules) { StringRef FuncName = Rule->getValueAsString("CompatFunc"); - OS << " Ret &= " << FuncName << "(Caller, Callee);\n"; + OS << " Ret &= " << FuncName << "(Caller, Callee"; + StringRef AttrName = Rule->getValueAsString("AttrName"); + if (!AttrName.empty()) + OS << ", \"" << AttrName << "\""; + OS << ");\n"; } OS << "\n"; diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp index 455183987b7b27..50156d34528c15 100644 --- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -57,7 +57,8 @@ class MatcherTableEmitter { // We de-duplicate the predicates by code string, and use this map to track // all the patterns with "identical" predicates. 
- StringMap> NodePredicatesByCodeToRun; + MapVector, StringMap> + NodePredicatesByCodeToRun; std::vector PatternPredicates; diff --git a/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp b/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp index 78dcd4471ae747..7f494e532b1f44 100644 --- a/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp +++ b/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp @@ -152,8 +152,7 @@ void MacroFusionPredicatorEmitter::emitFirstPredicate(Record *Predicate, << "if (FirstDest.isVirtual() && !MRI.hasOneNonDBGUse(FirstDest))\n"; OS.indent(4) << " return false;\n"; OS.indent(2) << "}\n"; - } else if (Predicate->isSubClassOf( - "FirstFusionPredicateWithMCInstPredicate")) { + } else if (Predicate->isSubClassOf("FusionPredicateWithMCInstPredicate")) { OS.indent(2) << "{\n"; OS.indent(4) << "const MachineInstr *MI = FirstMI;\n"; OS.indent(4) << "if ("; @@ -173,7 +172,7 @@ void MacroFusionPredicatorEmitter::emitFirstPredicate(Record *Predicate, void MacroFusionPredicatorEmitter::emitSecondPredicate(Record *Predicate, PredicateExpander &PE, raw_ostream &OS) { - if (Predicate->isSubClassOf("SecondFusionPredicateWithMCInstPredicate")) { + if (Predicate->isSubClassOf("FusionPredicateWithMCInstPredicate")) { OS.indent(2) << "{\n"; OS.indent(4) << "const MachineInstr *MI = &SecondMI;\n"; OS.indent(4) << "if ("; @@ -185,7 +184,7 @@ void MacroFusionPredicatorEmitter::emitSecondPredicate(Record *Predicate, OS.indent(2) << "}\n"; } else { PrintFatalError(Predicate->getLoc(), - "Unsupported predicate for first instruction: " + + "Unsupported predicate for second instruction: " + Predicate->getType()->getAsString()); } } @@ -196,9 +195,8 @@ void MacroFusionPredicatorEmitter::emitBothPredicate(Record *Predicate, if (Predicate->isSubClassOf("FusionPredicateWithCode")) OS << Predicate->getValueAsString("Predicate"); else if (Predicate->isSubClassOf("BothFusionPredicateWithMCInstPredicate")) { - Record *MCPred = Predicate->getValueAsDef("Predicate"); - 
emitFirstPredicate(MCPred, PE, OS); - emitSecondPredicate(MCPred, PE, OS); + emitFirstPredicate(Predicate, PE, OS); + emitSecondPredicate(Predicate, PE, OS); } else if (Predicate->isSubClassOf("TieReg")) { int FirstOpIdx = Predicate->getValueAsInt("FirstOpIdx"); int SecondOpIdx = Predicate->getValueAsInt("SecondOpIdx"); diff --git a/llvm/utils/TableGen/PredicateExpander.cpp b/llvm/utils/TableGen/PredicateExpander.cpp index d3a73e02cd916f..0b9b6389fe3817 100644 --- a/llvm/utils/TableGen/PredicateExpander.cpp +++ b/llvm/utils/TableGen/PredicateExpander.cpp @@ -59,6 +59,30 @@ void PredicateExpander::expandCheckImmOperandSimple(raw_ostream &OS, OS << ")"; } +void PredicateExpander::expandCheckImmOperandLT(raw_ostream &OS, int OpIndex, + int ImmVal, + StringRef FunctionMapper) { + if (!FunctionMapper.empty()) + OS << FunctionMapper << "("; + OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex + << ").getImm()"; + if (!FunctionMapper.empty()) + OS << ")"; + OS << (shouldNegate() ? " >= " : " < ") << ImmVal; +} + +void PredicateExpander::expandCheckImmOperandGT(raw_ostream &OS, int OpIndex, + int ImmVal, + StringRef FunctionMapper) { + if (!FunctionMapper.empty()) + OS << FunctionMapper << "("; + OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex + << ").getImm()"; + if (!FunctionMapper.empty()) + OS << ")"; + OS << (shouldNegate() ? 
" <= " : " > ") << ImmVal; +} + void PredicateExpander::expandCheckRegOperand(raw_ostream &OS, int OpIndex, const Record *Reg, StringRef FunctionMapper) { @@ -352,6 +376,16 @@ void PredicateExpander::expandPredicate(raw_ostream &OS, const Record *Rec) { Rec->getValueAsString("ImmVal"), Rec->getValueAsString("FunctionMapper")); + if (Rec->isSubClassOf("CheckImmOperandLT")) + return expandCheckImmOperandLT(OS, Rec->getValueAsInt("OpIndex"), + Rec->getValueAsInt("ImmVal"), + Rec->getValueAsString("FunctionMapper")); + + if (Rec->isSubClassOf("CheckImmOperandGT")) + return expandCheckImmOperandGT(OS, Rec->getValueAsInt("OpIndex"), + Rec->getValueAsInt("ImmVal"), + Rec->getValueAsString("FunctionMapper")); + if (Rec->isSubClassOf("CheckImmOperandSimple")) return expandCheckImmOperandSimple(OS, Rec->getValueAsInt("OpIndex"), Rec->getValueAsString("FunctionMapper")); diff --git a/llvm/utils/TableGen/PredicateExpander.h b/llvm/utils/TableGen/PredicateExpander.h index cfb0a3d51e6776..a0dc6302397883 100644 --- a/llvm/utils/TableGen/PredicateExpander.h +++ b/llvm/utils/TableGen/PredicateExpander.h @@ -61,6 +61,10 @@ class PredicateExpander { StringRef FunctionMapperer); void expandCheckImmOperandSimple(raw_ostream &OS, int OpIndex, StringRef FunctionMapper); + void expandCheckImmOperandLT(raw_ostream &OS, int OpIndex, int ImmVal, + StringRef FunctionMapper); + void expandCheckImmOperandGT(raw_ostream &OS, int OpIndex, int ImmVal, + StringRef FunctionMapper); void expandCheckRegOperand(raw_ostream &OS, int OpIndex, const Record *Reg, StringRef FunctionMapper); void expandCheckRegOperandSimple(raw_ostream &OS, int OpIndex, diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp index 8a860d0945bb1a..7ea02ecba324cb 100644 --- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp @@ -14,6 +14,7 @@ #include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "X86RecognizableInstr.h" 
+#include "llvm/ADT/StringSwitch.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/X86FoldTablesUtils.h" #include "llvm/TableGen/Record.h" @@ -80,6 +81,7 @@ class X86FoldTablesEmitter { bool FoldStore = false; enum BcastType { BCAST_NONE, + BCAST_W, BCAST_D, BCAST_Q, BCAST_SS, @@ -114,6 +116,9 @@ class X86FoldTablesEmitter { switch (BroadcastKind) { case BCAST_NONE: break; + case BCAST_W: + Attrs += "TB_BCAST_W|"; + break; case BCAST_D: Attrs += "TB_BCAST_D|"; break; @@ -529,45 +534,22 @@ void X86FoldTablesEmitter::addBroadcastEntry( assert(Table.find(RegInst) == Table.end() && "Override entry unexpectedly"); X86FoldTableEntry Result = X86FoldTableEntry(RegInst, MemInst); - Record *RegRec = RegInst->TheDef; - StringRef RegInstName = RegRec->getName(); - StringRef MemInstName = MemInst->TheDef->getName(); - Record *Domain = RegRec->getValueAsDef("ExeDomain"); - bool IsSSEPackedInt = Domain->getName() == "SSEPackedInt"; - if ((RegInstName.contains("DZ") || RegInstName.contains("DWZ") || - RegInstName.contains("Dr") || RegInstName.contains("I32")) && - IsSSEPackedInt) { - assert((MemInstName.contains("DZ") || RegInstName.contains("DWZ") || - MemInstName.contains("Dr") || MemInstName.contains("I32")) && - "Unmatched names for broadcast"); - Result.BroadcastKind = X86FoldTableEntry::BCAST_D; - } else if ((RegInstName.contains("QZ") || RegInstName.contains("QBZ") || - RegInstName.contains("Qr") || RegInstName.contains("I64")) && - IsSSEPackedInt) { - assert((MemInstName.contains("QZ") || MemInstName.contains("QBZ") || - MemInstName.contains("Qr") || MemInstName.contains("I64")) && - "Unmatched names for broadcast"); - Result.BroadcastKind = X86FoldTableEntry::BCAST_Q; - } else if ((RegInstName.contains("PS") || RegInstName.contains("F32") || - RegInstName.contains("CPH")) && - !RegInstName.contains("PH2PS")) { - assert((MemInstName.contains("PS") || MemInstName.contains("F32") || - MemInstName.contains("CPH")) && - "Unmatched names for broadcast"); - 
Result.BroadcastKind = X86FoldTableEntry::BCAST_SS; - } else if ((RegInstName.contains("PD") || RegInstName.contains("F64")) && - !RegInstName.contains("PH2PD")) { - assert((MemInstName.contains("PD") || MemInstName.contains("F64")) && - "Unmatched names for broadcast"); - Result.BroadcastKind = X86FoldTableEntry::BCAST_SD; - } else if (RegInstName.contains("PH")) { - assert(MemInstName.contains("PH") && "Unmatched names for broadcast"); - Result.BroadcastKind = X86FoldTableEntry::BCAST_SH; - } else { - errs() << RegInstName << ", " << MemInstName << "\n"; - llvm_unreachable("Name is not canoicalized for broadcast or " - "ExeDomain is incorrect"); + DagInit *In = MemInst->TheDef->getValueAsDag("InOperandList"); + for (unsigned I = 0, E = In->getNumArgs(); I != E; ++I) { + Result.BroadcastKind = + StringSwitch(In->getArg(I)->getAsString()) + .Case("i16mem", X86FoldTableEntry::BCAST_W) + .Case("i32mem", X86FoldTableEntry::BCAST_D) + .Case("i64mem", X86FoldTableEntry::BCAST_Q) + .Case("f16mem", X86FoldTableEntry::BCAST_SH) + .Case("f32mem", X86FoldTableEntry::BCAST_SS) + .Case("f64mem", X86FoldTableEntry::BCAST_SD) + .Default(X86FoldTableEntry::BCAST_NONE); + if (Result.BroadcastKind != X86FoldTableEntry::BCAST_NONE) + break; } + assert(Result.BroadcastKind != X86FoldTableEntry::BCAST_NONE && + "Unknown memory operand for broadcast"); Table[RegInst] = Result; } diff --git a/llvm/utils/gn/secondary/llvm/version.gni b/llvm/utils/gn/secondary/llvm/version.gni index f1137591766125..e55c1ed3d7a595 100644 --- a/llvm/utils/gn/secondary/llvm/version.gni +++ b/llvm/utils/gn/secondary/llvm/version.gni @@ -1,4 +1,4 @@ llvm_version_major = 18 -llvm_version_minor = 0 +llvm_version_minor = 1 llvm_version_patch = 0 llvm_version = "$llvm_version_major.$llvm_version_minor.$llvm_version_patch" diff --git a/llvm/utils/lit/lit/__init__.py b/llvm/utils/lit/lit/__init__.py index 1eea0887f1d119..1cfcc7d37813bc 100644 --- a/llvm/utils/lit/lit/__init__.py +++ b/llvm/utils/lit/lit/__init__.py 
@@ -2,7 +2,7 @@ __author__ = "Daniel Dunbar" __email__ = "daniel@minormatter.com" -__versioninfo__ = (18, 0, 0) +__versioninfo__ = (18, 1, 5) __version__ = ".".join(str(v) for v in __versioninfo__) + "dev" __all__ = [] diff --git a/llvm/utils/release/github-upload-release.py b/llvm/utils/release/github-upload-release.py index a8bb569d2fc999..8343dee937f78f 100755 --- a/llvm/utils/release/github-upload-release.py +++ b/llvm/utils/release/github-upload-release.py @@ -77,20 +77,28 @@ def upload_files(repo, release, files): parser.add_argument("--token", type=str) parser.add_argument("--release", type=str) parser.add_argument("--user", type=str) +parser.add_argument("--user-token", type=str) # Upload args parser.add_argument("--files", nargs="+", type=str) args = parser.parse_args() -github = github.Github(args.token) -llvm_org = github.get_organization("llvm") +gh = github.Github(args.token) +llvm_org = gh.get_organization("llvm") llvm_repo = llvm_org.get_repo("llvm-project") if args.user: + if not args.user_token: + print("--user-token option required when --user is used") + sys.exit(1) # Validate that this user is allowed to modify releases. 
- user = github.get_user(args.user) - team = llvm_org.get_team_by_slug("llvm-release-managers") + user = gh.get_user(args.user) + team = ( + github.Github(args.user_token) + .get_organization("llvm") + .get_team_by_slug("llvm-release-managers") + ) if not team.has_in_members(user): print("User {} is not a allowed to modify releases".format(args.user)) sys.exit(1) @@ -99,6 +107,6 @@ def upload_files(repo, release, files): sys.exit(1) if args.command == "create": - create_release(llvm_repo, args.release, args.user) + create_release(llvm_repo, args.release) if args.command == "upload": upload_files(llvm_repo, args.release, args.files) diff --git a/llvm/utils/release/test-release.sh b/llvm/utils/release/test-release.sh index 5b1945df47d24a..4314b565e11b03 100755 --- a/llvm/utils/release/test-release.sh +++ b/llvm/utils/release/test-release.sh @@ -532,11 +532,16 @@ function build_llvmCore() { BuildTarget="clang" InstallTarget="install-clang install-clang-resource-headers" # compiler-rt builtins is needed on AIX to have a functional Phase 1 clang. - if [ "$System" = "AIX" -o "$Phase" != "1" ]; then + if [ "$System" = "AIX" ]; then BuildTarget="$BuildTarget runtimes" - InstallTarget="$InstallTarget install-runtimes" + InstallTarget="$InstallTarget install-builtins" fi fi + if [ "$Phase" -eq "3" ]; then + # Build everything at once, with the proper parallelism and verbosity, + # in Phase 3. 
+ BuildTarget= + fi cd $ObjDir echo "# Compiling llvm $Release-$RC $Flavor" diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 516a984399ff81..638e46a2f9c752 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -253,22 +253,23 @@ def ROCDL_mfma_f32_32x32x16_fp8_fp8 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.fp8.f //===---------------------------------------------------------------------===// // WMMA intrinsics -class ROCDL_Wmma_IntrOp traits = []> : +class ROCDL_Wmma_IntrOp overloadedOperands, + list traits = []> : LLVM_IntrOpBase, + [0], overloadedOperands, traits, 1>, Arguments<(ins Variadic:$args)> { let assemblyFormat = "$args attr-dict `:` functional-type($args, $res)"; } // Available on RDNA3 -def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16">; -def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16">; -def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16">; -def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16">; -def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8">; -def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4">; +def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16", [0]>; +def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16", [0]>; +def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16", [0]>; +def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16", [0]>; +def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8", [1]>; +def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4", [1]>; //===---------------------------------------------------------------------===// // Operations on raw buffer resources (stride of 0, bounds checks either off or in diff --git 
a/mlir/include/mlir/IR/Dialect.h b/mlir/include/mlir/IR/Dialect.h index 45f29f37dd3b97..50f6f6de5c2897 100644 --- a/mlir/include/mlir/IR/Dialect.h +++ b/mlir/include/mlir/IR/Dialect.h @@ -281,7 +281,11 @@ class Dialect { /// Register a set of type classes with this dialect. template void addTypes() { - (addType(), ...); + // This initializer_list argument pack expansion is essentially equal to + // using a fold expression with a comma operator. Clang however, refuses + // to compile a fold expression with a depth of more than 256 by default. + // There seem to be no such limitations for initializer_list. + (void)std::initializer_list{0, (addType(), 0)...}; } /// Register a type instance with this dialect. @@ -292,7 +296,11 @@ class Dialect { /// Register a set of attribute classes with this dialect. template void addAttributes() { - (addAttribute(), ...); + // This initializer_list argument pack expansion is essentially equal to + // using a fold expression with a comma operator. Clang however, refuses + // to compile a fold expression with a depth of more than 256 by default. + // There seem to be no such limitations for initializer_list. + (void)std::initializer_list{0, (addAttribute(), 0)...}; } /// Register an attribute instance with this dialect. 
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp index ae2bd8e5b5405d..73d418cb841327 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp @@ -529,7 +529,8 @@ LogicalResult GPUPrintfOpToVPrintfLowering::matchAndRewrite( /*alignment=*/0); for (auto [index, arg] : llvm::enumerate(args)) { Value ptr = rewriter.create( - loc, ptrType, structType, tempAlloc, ArrayRef{0, index}); + loc, ptrType, structType, tempAlloc, + ArrayRef{0, static_cast(index)}); rewriter.create(loc, arg, ptr); } std::array printfArgs = {stringStart, tempAlloc}; diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index f853d5c47b623c..78d4e806246872 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -1041,13 +1041,14 @@ Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray( auto arrayPtr = builder.create( loc, llvmPointerType, llvmPointerType, arraySize, /*alignment=*/0); for (const auto &en : llvm::enumerate(arguments)) { + const auto index = static_cast(en.index()); Value fieldPtr = builder.create(loc, llvmPointerType, structType, structPtr, - ArrayRef{0, en.index()}); + ArrayRef{0, index}); builder.create(loc, en.value(), fieldPtr); - auto elementPtr = builder.create( - loc, llvmPointerType, llvmPointerType, arrayPtr, - ArrayRef{en.index()}); + auto elementPtr = + builder.create(loc, llvmPointerType, llvmPointerType, + arrayPtr, ArrayRef{index}); builder.create(loc, fieldPtr, elementPtr); } return arrayPtr; diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp index 72f9295749a66b..b25c831bc7172a 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp +++ b/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp @@ -488,7 
+488,8 @@ static void splitVectorStore(const DataLayout &dataLayout, Location loc, // Other patterns will turn this into a type-consistent GEP. auto gepOp = rewriter.create( loc, address.getType(), rewriter.getI8Type(), address, - ArrayRef{storeOffset + index * elementSize}); + ArrayRef{ + static_cast(storeOffset + index * elementSize)}); rewriter.create(loc, extractOp, gepOp); } @@ -524,9 +525,9 @@ static void splitIntegerStore(const DataLayout &dataLayout, Location loc, // We create an `i8` indexed GEP here as that is the easiest (offset is // already known). Other patterns turn this into a type-consistent GEP. - auto gepOp = - rewriter.create(loc, address.getType(), rewriter.getI8Type(), - address, ArrayRef{currentOffset}); + auto gepOp = rewriter.create( + loc, address.getType(), rewriter.getI8Type(), address, + ArrayRef{static_cast(currentOffset)}); rewriter.create(loc, valueToStore, gepOp); // No need to care about padding here since we already checked previously diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 140bdd1f2db361..be875297fc93ca 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -2092,6 +2092,7 @@ DiagnosedSilenceableFailure transform::ConvertToLoopsOp::applyToOne( scf::lowerToLoopsUsingSCFForOp(rewriter, target); if (failed(loops)) return emitDefaultDefiniteFailure(target); + rewriter.eraseOp(target); return DiagnosedSilenceableFailure::success(); } diff --git a/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir index 7969de0d456bb6..1b2c553b25ded0 100644 --- a/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir +++ b/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir @@ -33,6 +33,7 @@ module attributes 
{transform.with_named_sequence} { // CHECK: %[[MULF:.+]] = arith.mulf %[[LHS]], %[[RHS]] // CHECK: %[[ADDF:.+]] = arith.addf %[[OUT]], %[[MULF]] // CHECK: memref.store %[[ADDF]], %[[ARG2]][%[[IV0]], %[[IV1]]] +// CHECK-NOT: linalg.matmul ins(%arg0, %arg1 : memref, memref) // ----- diff --git a/mlir/test/Target/LLVMIR/llvmir-le-specific.mlir b/mlir/test/Target/LLVMIR/llvmir-le-specific.mlir new file mode 100644 index 00000000000000..f8d082082117cb --- /dev/null +++ b/mlir/test/Target/LLVMIR/llvmir-le-specific.mlir @@ -0,0 +1,27 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +// Decoding the attribute does not work on big-endian platforms currently +// XFAIL: target=s390x-{{.*}} + +// CHECK{LITERAL}: @dense_resource_tensor_constant = internal constant [5 x float] [float 0x3FCA034080000000, float 0xBFD0466300000000, float 0xBFD75DDF80000000, float 0xBFDE074F40000000, float 0x3FDDD3A1C0000000] +llvm.mlir.global internal constant @dense_resource_tensor_constant(dense_resource : tensor<5xf32>) : !llvm.array<5 x f32> + +// CHECK{LITERAL}: @dense_resource_vector_constant = internal constant <5 x float> +llvm.mlir.global internal constant @dense_resource_vector_constant(dense_resource : vector<5xf32>) : vector<5xf32> + + +// CHECK{LITERAL}: @dense_resource_multidim_tensor_constant = internal constant [1 x [2 x [2 x float]]] [[2 x [2 x float]] [[2 x float] [float 0x3FD6B46A80000000, float 0x3FD6781AC0000000], [2 x float] [float 0xBFB45A2AA0000000, float 0x3FD77A5CA0000000]]] +llvm.mlir.global internal constant @dense_resource_multidim_tensor_constant(dense_resource : tensor<1x2x2xf32>) : !llvm.array<1 x !llvm.array<2 x !llvm.array<2 x f32>>> + +// CHECK{LITERAL}: @dense_resource_multidim_vector_constant = internal constant [1 x [2 x <2 x float>]] [[2 x <2 x float>] [<2 x float> , <2 x float> ]] +llvm.mlir.global internal constant @dense_resource_multidim_vector_constant(dense_resource : vector<1x2x2xf32>) : !llvm.array<1 x !llvm.array<2 x 
vector<2 x f32>>> + +// Resources are kept at end of file. New tests should be added above this. +{-# + dialect_resources: { + builtin: { + dense_resource_test_5xf32: "0x08000000041A503E183382BEFCEEBABE7A3AF0BE0E9DEE3E", + dense_resource_test_2x2xf32: "0x0800000054A3B53ED6C0B33E55D1A2BDE5D2BB3E" + } + } +#-} \ No newline at end of file diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index 448aa3a5d85d79..961c9484446845 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -101,19 +101,6 @@ llvm.mlir.global internal @dense_float_vector_3d(dense<[[[1.0, 2.0], [3.0, 4.0]] // CHECK{LITERAL}: @splat_float_vector_3d = internal global [2 x [2 x <2 x float>]] [[2 x <2 x float>] [<2 x float> , <2 x float> ], [2 x <2 x float>] [<2 x float> , <2 x float> ]] llvm.mlir.global internal @splat_float_vector_3d(dense<42.0> : vector<2x2x2xf32>) : !llvm.array<2 x !llvm.array<2 x vector<2xf32>>> -// CHECK{LITERAL}: @dense_resource_tensor_constant = internal constant [5 x float] [float 0x3FCA034080000000, float 0xBFD0466300000000, float 0xBFD75DDF80000000, float 0xBFDE074F40000000, float 0x3FDDD3A1C0000000] -llvm.mlir.global internal constant @dense_resource_tensor_constant(dense_resource : tensor<5xf32>) : !llvm.array<5 x f32> - -// CHECK{LITERAL}: @dense_resource_vector_constant = internal constant <5 x float> -llvm.mlir.global internal constant @dense_resource_vector_constant(dense_resource : vector<5xf32>) : vector<5xf32> - - -// CHECK{LITERAL}: @dense_resource_multidim_tensor_constant = internal constant [1 x [2 x [2 x float]]] [[2 x [2 x float]] [[2 x float] [float 0x3FD6B46A80000000, float 0x3FD6781AC0000000], [2 x float] [float 0xBFB45A2AA0000000, float 0x3FD77A5CA0000000]]] -llvm.mlir.global internal constant @dense_resource_multidim_tensor_constant(dense_resource : tensor<1x2x2xf32>) : !llvm.array<1 x !llvm.array<2 x !llvm.array<2 x f32>>> - -// CHECK{LITERAL}: 
@dense_resource_multidim_vector_constant = internal constant [1 x [2 x <2 x float>]] [[2 x <2 x float>] [<2 x float> , <2 x float> ]] -llvm.mlir.global internal constant @dense_resource_multidim_vector_constant(dense_resource : vector<1x2x2xf32>) : !llvm.array<1 x !llvm.array<2 x vector<2 x f32>>> - // // Linkage attribute. // @@ -1590,16 +1577,6 @@ llvm.func @invokeLandingpad() -> i32 attributes { personality = @__gxx_personali llvm.invoke %9(%6, %0) to ^bb2 unwind ^bb1 vararg(!llvm.func) : !llvm.ptr, (!llvm.ptr, i32) -> () } -// Resources are kept at end of file. New tests should be added above this. -{-# - dialect_resources: { - builtin: { - dense_resource_test_5xf32: "0x08000000041A503E183382BEFCEEBABE7A3AF0BE0E9DEE3E", - dense_resource_test_2x2xf32: "0x0800000054A3B53ED6C0B33E55D1A2BDE5D2BB3E" - } - } -#-} - // ----- llvm.func @foo() -> i8 diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 3c9c70711ae230..26123300d74888 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -248,53 +248,53 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : v // ---- Wave32 ----- // f16 -> f32 - // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}}) + // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}}) %r0 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg0 : (vector<16xf16>, vector<16xf16>, vector<8xf32>) -> vector<8xf32> // bf16 -> f32 - // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}}) + // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}}) %r1 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg0 : (vector<16xi16>, vector<16xi16>, vector<8xf32>) -> 
vector<8xf32> // f16 -> f16 (OPSEL = {0,1}) - // CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}}) + // CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}}) %r2 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg1, %zero : (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16> // bf16 -> bf16 (OPSEL = {0,1}) - // CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}}) + // CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}}) %r4 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg2, %zero : (vector<16xi16>, vector<16xi16>, vector<16xi16>, i1) -> vector<16xi16> // int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) %r5 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg3, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<8xi32>, i1) -> vector<8xi32> // int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}}) %r6 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg3, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> 
vector<8xi32> // ---- Wave64 ----- // f16 -> f32 - // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}}) %r7 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg6 : (vector<16xf16>, vector<16xf16>, vector<4xf32>) -> vector<4xf32> // bf16 -> f32 - // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}}) %r8 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg6 : (vector<16xi16>, vector<16xi16>, vector<4xf32>) -> vector<4xf32> // f16 -> f16 (OPSEL = {0,1}) - // CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}}) %r9 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg7, %zero : (vector<16xf16>, vector<16xf16>, vector<8xf16>, i1) -> vector<8xf16> // bf16 -> bf16 (OPSEL = {0,1}) - // CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}}) + // CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}}) %r11 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg8, %zero : (vector<16xi16>, vector<16xi16>, vector<8xi16>, i1) -> vector<8xi16> // int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <4 
x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) %r12 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg5, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<4xi32>, i1) -> vector<4xi32> // int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1}) - // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) + // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}) %r13 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg5, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<4xi32>, i1) -> vector<4xi32> llvm.return %r0 : vector<8xf32> diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp index 71326049af0579..7f748cfbd31ad4 100644 --- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp @@ -3058,7 +3058,7 @@ void OpEmitter::genCodeForAddingArgAndRegionForBuilder( body << llvm::formatv( "static_cast(std::accumulate({0}.begin(), {0}.end(), 0, " "[](int32_t curSum, ::mlir::ValueRange range) {{ return curSum + " - "range.size(); }))", + "static_cast(range.size()); }))", operandName); } else { body << "static_cast(" << getArgumentName(op, i) << ".size())"; diff --git a/openmp/cmake/HandleOpenMPOptions.cmake b/openmp/cmake/HandleOpenMPOptions.cmake index 201aeabbd3df9c..9387d9b3b0ff75 100644 --- a/openmp/cmake/HandleOpenMPOptions.cmake +++ b/openmp/cmake/HandleOpenMPOptions.cmake @@ -9,6 +9,14 @@ if (NOT COMMAND append_if) endfunction() endif() +if (NOT COMMAND append) + function(append value) + foreach(variable ${ARGN}) + set(${variable} "${${variable}} ${value}" PARENT_SCOPE) + endforeach(variable) + endfunction() +endif() + # MSVC and clang-cl in 
compatibility mode map -Wall to -Weverything. # TODO: LLVM adds /W4 instead, check if that works for the OpenMP runtimes. if (NOT MSVC) @@ -38,7 +46,11 @@ append_if(OPENMP_HAVE_WEXTRA_FLAG "-Wno-extra" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WPEDANTIC_FLAG "-Wno-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WMAYBE_UNINITIALIZED_FLAG "-Wno-maybe-uninitialized" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) -append_if(OPENMP_HAVE_NO_SEMANTIC_INTERPOSITION "-fno-semantic-interposition" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +if (NOT (WIN32 OR CYGWIN)) + # This flag is not relevant on Windows; the flag is accepted, but produces warnings + # about argument unused during compilation. + append_if(OPENMP_HAVE_NO_SEMANTIC_INTERPOSITION "-fno-semantic-interposition" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +endif() append_if(OPENMP_HAVE_FUNCTION_SECTIONS "-ffunction-section" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_DATA_SECTIONS "-fdata-sections" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) diff --git a/openmp/docs/ReleaseNotes.rst b/openmp/docs/ReleaseNotes.rst index 3eeaf5c900d800..a5b39f61b0b64c 100644 --- a/openmp/docs/ReleaseNotes.rst +++ b/openmp/docs/ReleaseNotes.rst @@ -19,3 +19,5 @@ from the `LLVM releases web site `_. Non-comprehensive list of changes in this release ================================================= + +* SystemZ support added. 
diff --git a/openmp/libomptarget/include/Shared/SourceInfo.h b/openmp/libomptarget/include/Shared/SourceInfo.h index 7ce5fd43efc07f..711f06a04d017f 100644 --- a/openmp/libomptarget/include/Shared/SourceInfo.h +++ b/openmp/libomptarget/include/Shared/SourceInfo.h @@ -13,6 +13,7 @@ #ifndef OMPTARGET_SHARED_SOURCE_INFO_H #define OMPTARGET_SHARED_SOURCE_INFO_H +#include #include #ifdef _WIN32 diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index c287a31e0b1b54..46ee4c9fba7109 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -818,6 +818,7 @@ class KMPAffinity { typedef KMPAffinity::Mask kmp_affin_mask_t; extern KMPAffinity *__kmp_affinity_dispatch; +#ifndef KMP_OS_AIX class kmp_affinity_raii_t { kmp_affin_mask_t *mask; bool restored; @@ -842,6 +843,7 @@ class kmp_affinity_raii_t { } ~kmp_affinity_raii_t() { restore(); } }; +#endif // !KMP_OS_AIX // Declare local char buffers with this size for printing debug and info // messages, using __kmp_affinity_print_mask(). @@ -1181,7 +1183,11 @@ extern void __kmp_init_target_task(); #define KMP_MIN_STKSIZE ((size_t)(32 * 1024)) #endif +#if KMP_OS_AIX && KMP_ARCH_PPC +#define KMP_MAX_STKSIZE 0x10000000 /* 256Mb max size on 32-bit AIX */ +#else #define KMP_MAX_STKSIZE (~((size_t)1 << ((sizeof(size_t) * (1 << 3)) - 1))) +#endif #if KMP_ARCH_X86 #define KMP_DEFAULT_STKSIZE ((size_t)(2 * 1024 * 1024)) @@ -2494,14 +2500,15 @@ typedef struct kmp_dephash_entry kmp_dephash_entry_t; #define KMP_DEP_MTX 0x4 #define KMP_DEP_SET 0x8 #define KMP_DEP_ALL 0x80 -// Compiler sends us this info: +// Compiler sends us this info. Note: some test cases contain an explicit copy +// of this struct and should be in sync with any changes here. 
typedef struct kmp_depend_info { kmp_intptr_t base_addr; size_t len; union { kmp_uint8 flag; // flag as an unsigned char struct { // flag as a set of 8 bits -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) /* Same fields as in the #else branch, but in reverse order */ unsigned all : 1; unsigned unused : 3; @@ -2666,7 +2673,7 @@ typedef struct kmp_task_stack { #endif // BUILD_TIED_TASK_STACK typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */ -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) /* Same fields as in the #else branch, but in reverse order */ #if OMPX_TASKGRAPH unsigned reserved31 : 6; @@ -3906,7 +3913,7 @@ extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size); #if KMP_WEIGHTED_ITERATIONS_SUPPORTED extern int __kmp_get_first_osid_with_ecore(void); #endif -#if KMP_OS_LINUX || KMP_OS_FREEBSD +#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX extern int kmp_set_thread_affinity_mask_initial(void); #endif static inline void __kmp_assign_root_init_mask() { diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp index 6a41d34b023729..1ac541fbcaa707 100644 --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -2906,12 +2906,17 @@ static inline const char *__kmp_cpuinfo_get_envvar() { } // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the -// affinity map. +// affinity map. On AIX, the map is obtained through system SRAD (Scheduler +// Resource Allocation Domain). 
static bool __kmp_affinity_create_cpuinfo_map(int *line, kmp_i18n_id_t *const msg_id) { + *msg_id = kmp_i18n_null; + +#if KMP_OS_AIX + unsigned num_records = __kmp_xproc; +#else const char *filename = __kmp_cpuinfo_get_filename(); const char *envvar = __kmp_cpuinfo_get_envvar(); - *msg_id = kmp_i18n_null; if (__kmp_affinity.flags.verbose) { KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); @@ -2970,6 +2975,7 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, *msg_id = kmp_i18n_str_CantRewindCpuinfo; return false; } +#endif // KMP_OS_AIX // Allocate the array of records to store the proc info in. The dummy // element at the end makes the logic in filling them out easier to code. @@ -2999,6 +3005,99 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, INIT_PROC_INFO(threadInfo[i]); } +#if KMP_OS_AIX + int smt_threads; + lpar_info_format1_t cpuinfo; + unsigned num_avail = __kmp_xproc; + + if (__kmp_affinity.flags.verbose) + KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "system info for topology"); + + // Get the number of SMT threads per core. + int retval = + lpar_get_info(LPAR_INFO_FORMAT1, &cpuinfo, sizeof(lpar_info_format1_t)); + if (!retval) + smt_threads = cpuinfo.smt_threads; + else { + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + + // Allocate a resource set containing available system resourses. + rsethandle_t sys_rset = rs_alloc(RS_SYSTEM); + if (sys_rset == NULL) { + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + // Allocate a resource set for the SRAD info. + rsethandle_t srad = rs_alloc(RS_EMPTY); + if (srad == NULL) { + rs_free(sys_rset); + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + + // Get the SRAD system detail level. 
+ int sradsdl = rs_getinfo(NULL, R_SRADSDL, 0); + if (sradsdl < 0) { + rs_free(sys_rset); + rs_free(srad); + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + // Get the number of RADs at that SRAD SDL. + int num_rads = rs_numrads(sys_rset, sradsdl, 0); + if (num_rads < 0) { + rs_free(sys_rset); + rs_free(srad); + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + + // Get the maximum number of procs that may be contained in a resource set. + int max_procs = rs_getinfo(NULL, R_MAXPROCS, 0); + if (max_procs < 0) { + rs_free(sys_rset); + rs_free(srad); + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_UnknownTopology; + return false; + } + + int cur_rad = 0; + int num_set = 0; + for (int srad_idx = 0; cur_rad < num_rads && srad_idx < VMI_MAXRADS; + ++srad_idx) { + // Check if the SRAD is available in the RSET. + if (rs_getrad(sys_rset, srad, sradsdl, srad_idx, 0) < 0) + continue; + + for (int cpu = 0; cpu < max_procs; cpu++) { + // Set the info for the cpu if it is in the SRAD. + if (rs_op(RS_TESTRESOURCE, srad, NULL, R_PROCS, cpu)) { + threadInfo[cpu][osIdIndex] = cpu; + threadInfo[cpu][pkgIdIndex] = cur_rad; + threadInfo[cpu][coreIdIndex] = cpu / smt_threads; + ++num_set; + if (num_set >= num_avail) { + // Done if all available CPUs have been set. + break; + } + } + } + ++cur_rad; + } + rs_free(sys_rset); + rs_free(srad); + + // The topology is already sorted. + +#else // !KMP_OS_AIX unsigned num_avail = 0; *line = 0; #if KMP_ARCH_S390X @@ -3246,6 +3345,8 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, qsort(threadInfo, num_avail, sizeof(*threadInfo), __kmp_affinity_cmp_ProcCpuInfo_phys_id); +#endif // KMP_OS_AIX + // The table is now sorted by pkgId / coreId / threadId, but we really don't // know the radix of any of the fields. pkgId's may be sparsely assigned among // the chips on a system. 
Although coreId's are usually assigned @@ -4441,7 +4542,7 @@ static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) { } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ -#if KMP_OS_LINUX +#if KMP_OS_LINUX || KMP_OS_AIX if (!success) { int line = 0; success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); @@ -4837,7 +4938,12 @@ void __kmp_affinity_uninitialize(void) { } if (__kmp_affin_origMask != NULL) { if (KMP_AFFINITY_CAPABLE()) { +#if KMP_OS_AIX + // Uninitialize by unbinding the thread. + bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY); +#else __kmp_set_system_affinity(__kmp_affin_origMask, FALSE); +#endif } KMP_CPU_FREE(__kmp_affin_origMask); __kmp_affin_origMask = NULL; @@ -5011,7 +5117,10 @@ void __kmp_affinity_bind_init_mask(int gtid) { __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); } else #endif +#ifndef KMP_OS_AIX + // Do not set the full mask as the init mask on AIX. __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); +#endif } void __kmp_affinity_bind_place(int gtid) { @@ -5124,7 +5233,7 @@ int __kmp_aux_set_affinity(void **mask) { int __kmp_aux_get_affinity(void **mask) { int gtid; int retval; -#if KMP_OS_WINDOWS || KMP_DEBUG +#if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG kmp_info_t *th; #endif if (!KMP_AFFINITY_CAPABLE()) { @@ -5132,7 +5241,7 @@ int __kmp_aux_get_affinity(void **mask) { } gtid = __kmp_entry_gtid(); -#if KMP_OS_WINDOWS || KMP_DEBUG +#if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG th = __kmp_threads[gtid]; #else (void)gtid; // unused variable @@ -5155,7 +5264,7 @@ int __kmp_aux_get_affinity(void **mask) { } } -#if !KMP_OS_WINDOWS +#if !KMP_OS_WINDOWS && !KMP_OS_AIX retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); KA_TRACE( @@ -5175,7 +5284,7 @@ int __kmp_aux_get_affinity(void **mask) { KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); return 0; -#endif /* KMP_OS_WINDOWS */ +#endif /* !KMP_OS_WINDOWS && !KMP_OS_AIX */ } int 
__kmp_aux_get_affinity_max_proc() { @@ -5557,7 +5666,7 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { } } -#if KMP_OS_LINUX || KMP_OS_FREEBSD +#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX // We don't need this entry for Windows because // there is GetProcessAffinityMask() api // @@ -5592,7 +5701,11 @@ extern "C" "set full mask for thread %d\n", gtid)); KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); +#if KMP_OS_AIX + return bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY); +#else return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); +#endif } #endif diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h index 5464259784e2ba..1fb70491a9ede1 100644 --- a/openmp/runtime/src/kmp_affinity.h +++ b/openmp/runtime/src/kmp_affinity.h @@ -191,7 +191,7 @@ class KMPHwlocAffinity : public KMPAffinity { }; #endif /* KMP_USE_HWLOC */ -#if KMP_OS_LINUX || KMP_OS_FREEBSD +#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX #if KMP_OS_LINUX /* On some of the older OS's that we build on, these constants aren't present in #included from . They must be the same on @@ -314,6 +314,10 @@ class KMPHwlocAffinity : public KMPAffinity { #elif KMP_OS_FREEBSD #include #include +#elif KMP_OS_AIX +#include +#include +#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX. #endif class KMPNativeAffinity : public KMPAffinity { class Mask : public KMPAffinity::Mask { @@ -401,6 +405,70 @@ class KMPNativeAffinity : public KMPAffinity { ++retval; return retval; } +#if KMP_OS_AIX + // On AIX, we don't have a way to get CPU(s) a thread is bound to. + // This routine is only used to get the full mask. + int get_system_affinity(bool abort_on_error) override { + KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), + "Illegal get affinity operation when not capable"); + + (void)abort_on_error; + + // Set the mask with all CPUs that are available. 
+ for (int i = 0; i < __kmp_xproc; ++i) + KMP_CPU_SET(i, this); + return 0; + } + int set_system_affinity(bool abort_on_error) const override { + KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), + + "Illegal set affinity operation when not capable"); + + int location; + int gtid = __kmp_entry_gtid(); + int tid = thread_self(); + + // Unbind the thread if it was bound to any processors before so that + // we can bind the thread to CPUs specified by the mask not others. + int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY); + + // On AIX, we can only bind to one instead of a set of CPUs with the + // bindprocessor() system call. + KMP_CPU_SET_ITERATE(location, this) { + if (KMP_CPU_ISSET(location, this)) { + retval = bindprocessor(BINDTHREAD, tid, location); + if (retval == -1 && errno == 1) { + rsid_t rsid; + rsethandle_t rsh; + // Put something in rsh to prevent compiler warning + // about uninitalized use + rsh = rs_alloc(RS_EMPTY); + rsid.at_pid = getpid(); + if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) { + retval = ra_detachrset(R_PROCESS, rsid, 0); + retval = bindprocessor(BINDTHREAD, tid, location); + } + } + if (retval == 0) { + KA_TRACE(10, ("__kmp_set_system_affinity: Done binding " + "T#%d to cpu=%d.\n", + gtid, location)); + continue; + } + int error = errno; + if (abort_on_error) { + __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"), + KMP_ERR(error), __kmp_msg_null); + KA_TRACE(10, ("__kmp_set_system_affinity: Error binding " + "T#%d to cpu=%d, errno=%d.\n", + gtid, location, error)); + return error; + } + } + } + return 0; + } +#else // !KMP_OS_AIX int get_system_affinity(bool abort_on_error) override { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal get affinity operation when not capable"); @@ -443,6 +511,7 @@ class KMPNativeAffinity : public KMPAffinity { } return error; } +#endif // KMP_OS_AIX }; void determine_capable(const char *env_var) override { __kmp_affinity_determine_capable(env_var); @@ -471,7 +540,7 @@ class 
KMPNativeAffinity : public KMPAffinity { } api_type get_api_type() const override { return NATIVE_OS; } }; -#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */ +#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX */ #if KMP_OS_WINDOWS class KMPNativeAffinity : public KMPAffinity { diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index 9eeaeb88fb9ec7..878e78b5c7ad2d 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -1533,8 +1533,9 @@ void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid, kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint); if (*lk == 0) { if (KMP_IS_D_LOCK(lockseq)) { - KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, - KMP_GET_D_TAG(lockseq)); + KMP_COMPARE_AND_STORE_ACQ32( + (volatile kmp_int32 *)&((kmp_base_tas_lock_t *)crit)->poll, 0, + KMP_GET_D_TAG(lockseq)); } else { __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq)); } diff --git a/openmp/runtime/src/kmp_gsupport.cpp b/openmp/runtime/src/kmp_gsupport.cpp index 88189659a23416..4dc8a90f83b4ea 100644 --- a/openmp/runtime/src/kmp_gsupport.cpp +++ b/openmp/runtime/src/kmp_gsupport.cpp @@ -144,7 +144,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_BARRIER)(void) { // Mutual exclusion -// The symbol that icc/ifort generates for unnamed for unnamed critical sections +// The symbol that icc/ifort generates for unnamed critical sections // - .gomp_critical_user_ - is defined using .comm in any objects reference it. // We can't reference it directly here in C code, as the symbol contains a ".". // diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp index 85c54f4cdc7e96..0ad14f862bcb9b 100644 --- a/openmp/runtime/src/kmp_lock.cpp +++ b/openmp/runtime/src/kmp_lock.cpp @@ -2689,7 +2689,7 @@ void __kmp_spin_backoff(kmp_backoff_t *boff) { // lock word. 
static void __kmp_init_direct_lock(kmp_dyna_lock_t *lck, kmp_dyna_lockseq_t seq) { - TCW_4(*lck, KMP_GET_D_TAG(seq)); + TCW_4(((kmp_base_tas_lock_t *)lck)->poll, KMP_GET_D_TAG(seq)); KA_TRACE( 20, ("__kmp_init_direct_lock: initialized direct lock with type#%d\n", seq)); @@ -3180,8 +3180,8 @@ kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock, lck->type = tag; if (OMP_LOCK_T_SIZE < sizeof(void *)) { - *((kmp_lock_index_t *)user_lock) = idx - << 1; // indirect lock word must be even + *(kmp_lock_index_t *)&(((kmp_base_tas_lock_t *)user_lock)->poll) = + idx << 1; // indirect lock word must be even } else { *((kmp_indirect_lock_t **)user_lock) = lck; } diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h index f21179b4eb68a1..6202f3d617cc59 100644 --- a/openmp/runtime/src/kmp_lock.h +++ b/openmp/runtime/src/kmp_lock.h @@ -50,7 +50,7 @@ typedef struct ident ident_t; // recent versions), but we are bounded by the pointer-sized chunks that // the Intel compiler allocates. -#if KMP_OS_LINUX && defined(KMP_GOMP_COMPAT) +#if (KMP_OS_LINUX || KMP_OS_AIX) && defined(KMP_GOMP_COMPAT) #define OMP_LOCK_T_SIZE sizeof(int) #define OMP_NEST_LOCK_T_SIZE sizeof(void *) #else @@ -120,8 +120,16 @@ extern void __kmp_validate_locks(void); struct kmp_base_tas_lock { // KMP_LOCK_FREE(tas) => unlocked; locked: (gtid+1) of owning thread +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) && \ + __LP64__ + // Flip the ordering of the high and low 32-bit member to be consistent + // with the memory layout of the address in 64-bit big-endian. 
+ kmp_int32 depth_locked; // depth locked, for nested locks only + std::atomic poll; +#else std::atomic poll; kmp_int32 depth_locked; // depth locked, for nested locks only +#endif }; typedef struct kmp_base_tas_lock kmp_base_tas_lock_t; @@ -1138,11 +1146,13 @@ extern int (**__kmp_indirect_test)(kmp_user_lock_p, kmp_int32); // Extracts direct lock tag from a user lock pointer #define KMP_EXTRACT_D_TAG(l) \ - (*((kmp_dyna_lock_t *)(l)) & ((1 << KMP_LOCK_SHIFT) - 1) & \ - -(*((kmp_dyna_lock_t *)(l)) & 1)) + ((kmp_dyna_lock_t)((kmp_base_tas_lock_t *)(l))->poll & \ + ((1 << KMP_LOCK_SHIFT) - 1) & \ + -((kmp_dyna_lock_t)((kmp_tas_lock_t *)(l))->lk.poll & 1)) // Extracts indirect lock index from a user lock pointer -#define KMP_EXTRACT_I_INDEX(l) (*(kmp_lock_index_t *)(l) >> 1) +#define KMP_EXTRACT_I_INDEX(l) \ + ((kmp_lock_index_t)((kmp_base_tas_lock_t *)(l))->poll >> 1) // Returns function pointer to the direct lock function with l (kmp_dyna_lock_t // *) and op (operation type). diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index a0552dd930a62a..9cd0aefaea2dc0 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -75,7 +75,8 @@ #error Unknown compiler #endif -#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD) && !KMP_OS_WASI +#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD || KMP_OS_AIX) && \ + !KMP_OS_WASI #define KMP_AFFINITY_SUPPORTED 1 #if KMP_OS_WINDOWS && KMP_ARCH_X86_64 #define KMP_GROUP_AFFINITY 1 diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp index d2157b10b7819a..ec86ee07472c1e 100644 --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -255,8 +255,13 @@ static void __kmp_stg_parse_bool(char const *name, char const *value, // placed here in order to use __kmp_round4k static function void __kmp_check_stksize(size_t *val) { // if system stack size is too big then limit the size for worker threads +#if KMP_OS_AIX + if (*val > 
KMP_DEFAULT_STKSIZE * 2) // Use 2 times, 16 is too large for AIX. + *val = KMP_DEFAULT_STKSIZE * 2; +#else if (*val > KMP_DEFAULT_STKSIZE * 16) // just a heuristics... *val = KMP_DEFAULT_STKSIZE * 16; +#endif if (*val < __kmp_sys_min_stksize) *val = __kmp_sys_min_stksize; if (*val > KMP_MAX_STKSIZE) diff --git a/openmp/runtime/src/z_AIX_asm.S b/openmp/runtime/src/z_AIX_asm.S new file mode 100644 index 00000000000000..d711fcb7a7854f --- /dev/null +++ b/openmp/runtime/src/z_AIX_asm.S @@ -0,0 +1,410 @@ +// z_AIX_asm.S: - microtasking routines specifically +// written for Power platforms running AIX OS + +// +////===----------------------------------------------------------------------===// +//// +//// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +//// See https://llvm.org/LICENSE.txt for license information. +//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//// +////===----------------------------------------------------------------------===// +// + +// ----------------------------------------------------------------------- +// macros +// ----------------------------------------------------------------------- + +#include "kmp_config.h" + +#if KMP_OS_AIX +//------------------------------------------------------------------------ +// int +// __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...), +// int gtid, int tid, +// int argc, void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)( & gtid, & tid, p_argv[0], ... ); +// +// // FIXME: This is done at call-site and can be removed here. 
+// #if OMPT_SUPPORT +// *exit_frame_ptr = 0; +// #endif +// +// return 1; +// } +// +// parameters: +// r3: pkfn +// r4: gtid +// r5: tid +// r6: argc +// r7: p_argv +// r8: &exit_frame +// +// return: r3 (always 1/TRUE) +// + +#if KMP_ARCH_PPC64_XCOFF + + .globl __kmp_invoke_microtask[DS] + .globl .__kmp_invoke_microtask + .align 4 + .csect __kmp_invoke_microtask[DS],3 + .vbyte 8, .__kmp_invoke_microtask + .vbyte 8, TOC[TC0] + .vbyte 8, 0 + .csect .text[PR],2 + .machine "pwr7" +.__kmp_invoke_microtask: + + +// -- Begin __kmp_invoke_microtask +// mark_begin; + +// We need to allocate a stack frame large enough to hold all of the parameters +// on the stack for the microtask plus what this function needs. That's 48 +// bytes under the XCOFF64 ABI, plus max(64, 8*(2 + argc)) for +// the parameters to the microtask (gtid, tid, argc elements of p_argv), +// plus 8 bytes to store the values of r4 and r5, and 8 bytes to store r31. +// With OMP-T support, we need an additional 8 bytes to save r30 to hold +// a copy of r8. +// Stack offsets relative to stack pointer: +// r31: -8, r30: -16, gtid: -20, tid: -24 + + mflr 0 + std 31, -8(1) # Save r31 to the stack + std 0, 16(1) # Save LR to the linkage area + +// This is unusual because normally we'd set r31 equal to r1 after the stack +// frame is established. In this case, however, we need to dynamically compute +// the stack frame size, and so we keep a direct copy of r1 to access our +// register save areas and restore the r1 value before returning. + mr 31, 1 + +// Compute the size of the "argc" portion of the parameter save area. +// The parameter save area is always at least 64 bytes long (i.e. 8 regs) +// The microtask has (2 + argc) parameters, so if argc <= 6, we need to +// to allocate 8*6 bytes, not 8*argc. + li 0, 6 + cmpwi 0, 6, 6 + iselgt 0, 6, 0 # r0 = (argc > 6)? argc : 6 + sldi 0, 0, 3 # r0 = 8 * max(argc, 6) + +// Compute the size necessary for the local stack frame. 
+// 88 = 48 + 4 (for r4) + 4 (for r5) + 8 (for r31) + 8 (for OMP-T r30) + +// 8 (parameter gtid) + 8 (parameter tid) + li 12, 88 + add 12, 0, 12 + neg 12, 12 + +// We need to make sure that the stack frame stays aligned (to 16 bytes). + li 0, -16 + and 12, 0, 12 + +// Establish the local stack frame. + stdux 1, 1, 12 + +#if OMPT_SUPPORT + std 30, -16(31) # Save r30 to the stack + std 1, 0(8) + mr 30, 8 +#endif + +// Store gtid and tid to the stack because they're passed by reference to the microtask. + stw 4, -20(31) # Save gtid to the stack + stw 5, -24(31) # Save tid to the stack + + mr 12, 6 # r12 = argc + mr 4, 7 # r4 = p_argv + + cmpwi 0, 12, 1 + blt 0, .Lcall # if (argc < 1) goto .Lcall + + ld 5, 0(4) # r5 = p_argv[0] + + cmpwi 0, 12, 2 + blt 0, .Lcall # if (argc < 2) goto .Lcall + + ld 6, 8(4) # r6 = p_argv[1] + + cmpwi 0, 12, 3 + blt 0, .Lcall # if (argc < 3) goto .Lcall + + ld 7, 16(4) # r7 = p_argv[2] + + cmpwi 0, 12, 4 + blt 0, .Lcall # if (argc < 4) goto .Lcall + + ld 8, 24(4) # r8 = p_argv[3] + + cmpwi 0, 12, 5 + blt 0, .Lcall # if (argc < 5) goto .Lcall + + ld 9, 32(4) # r9 = p_argv[4] + + cmpwi 0, 12, 6 + blt 0, .Lcall # if (argc < 6) goto .Lcall + + ld 10, 40(4) # r10 = p_argv[5] + + cmpwi 0, 12, 7 + blt 0, .Lcall # if (argc < 7) goto .Lcall + +// There are more than 6 microtask parameters, so we need to store the +// remainder to the stack. + addi 12, 12, -6 # argc -= 6 + mtctr 12 + +// These are set to 8 bytes before the first desired store address (we're using +// pre-increment loads and stores in the loop below). The parameter save area +// for the microtask begins 48 + 8*8 == 112 bytes above r1 for XCOFF64. + addi 4, 4, 40 # p_argv = p_argv + 5 + # (i.e. skip the 5 elements we already processed) + addi 12, 1, 104 # r12 = stack offset (112 - 8) + +.Lnext: + ldu 0, 8(4) + stdu 0, 8(12) + bdnz .Lnext + +.Lcall: + std 2, 40(1) # Save the TOC pointer to the linkage area +// Load the actual function address from the function descriptor. 
+ ld 12, 0(3) # Function address
+ ld 2, 8(3) # TOC pointer
+ ld 11, 16(3) # Environment pointer
+
+ addi 3, 31, -20 # r3 = &gtid
+ addi 4, 31, -24 # r4 = &tid
+
+ mtctr 12 # CTR = function address
+ bctrl # Branch to CTR
+ ld 2, 40(1) # Restore TOC pointer from linkage area
+
+#if OMPT_SUPPORT
+ li 3, 0
+ std 3, 0(30)
+#endif
+
+ li 3, 1
+
+#if OMPT_SUPPORT
+ ld 30, -16(31) # Restore r30 from the saved value on the stack
+#endif
+
+ mr 1, 31
+ ld 31, -8(1) # Restore r31 from the saved value on the stack
+ ld 0, 16(1)
+ mtlr 0 # Restore LR from the linkage area
+ blr # Branch to LR
+
+#else // KMP_ARCH_PPC_XCOFF
+
+ .globl __kmp_invoke_microtask[DS]
+ .globl .__kmp_invoke_microtask
+ .align 4
+ .csect __kmp_invoke_microtask[DS],2
+ .vbyte 4, .__kmp_invoke_microtask
+ .vbyte 4, TOC[TC0]
+ .vbyte 4, 0
+ .csect .text[PR],2
+ .machine "pwr7"
+.__kmp_invoke_microtask:
+
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+
+// We need to allocate a stack frame large enough to hold all of the parameters
+// on the stack for the microtask plus what this function needs. That's 24
+// bytes under the XCOFF ABI, plus max(32, 8*(2 + argc)) for
+// the parameters to the microtask (gtid, tid, argc elements of p_argv),
+// plus 8 bytes to store the values of r4 and r5, and 4 bytes to store r31.
+// With OMP-T support, we need an additional 4 bytes to save r30 to hold
+// a copy of r8.
+// Stack offsets relative to stack pointer:
+// r31: -4, r30: -8, gtid: -12, tid: -16
+
+ mflr 0
+ stw 31, -4(1) # Save r31 to the stack
+ stw 0, 8(1) # Save LR to the linkage area
+
+// This is unusual because normally we'd set r31 equal to r1 after the stack
+// frame is established. In this case, however, we need to dynamically compute
+// the stack frame size, and so we keep a direct copy of r1 to access our
+// register save areas and restore the r1 value before returning.
+ mr 31, 1
+
+// Compute the size of the "argc" portion of the parameter save area. 
+// The parameter save area is always at least 32 bytes long (i.e. 8 regs) +// The microtask has (2 + argc) parameters, so if argc <= 6, we need to +// to allocate 4*6 bytes, not 4*argc. + li 0, 6 + cmpwi 0, 6, 6 + iselgt 0, 6, 0 # r0 = (argc > 6)? argc : 6 + slwi 0, 0, 2 # r0 = 4 * max(argc, 6) + +// Compute the size necessary for the local stack frame. +// 56 = 32 + 4 (for r4) + 4 (for r5) + 4 (for r31) + 4 (for OMP-T r30) + +// 4 (parameter gtid) + 4 (parameter tid) + li 12, 56 + add 12, 0, 12 + neg 12, 12 + +// We need to make sure that the stack frame stays aligned (to 16 bytes). + li 0, -16 + and 12, 0, 12 + +// Establish the local stack frame. + stwux 1, 1, 12 + +#if OMPT_SUPPORT + stw 30, -8(31) # Save r30 to the stack + stw 1, 0(8) + mr 30, 8 +#endif + +// Store gtid and tid to the stack because they're passed by reference to the microtask. + stw 4, -12(31) # Save gtid to the stack + stw 5, -16(31) # Save tid to the stack + + mr 12, 6 # r12 = argc + mr 4, 7 # r4 = p_argv + + cmpwi 0, 12, 1 + blt 0, .Lcall # if (argc < 1) goto .Lcall + + lwz 5, 0(4) # r5 = p_argv[0] + + cmpwi 0, 12, 2 + blt 0, .Lcall # if (argc < 2) goto .Lcall + + lwz 6, 4(4) # r6 = p_argv[1] + + cmpwi 0, 12, 3 + blt 0, .Lcall # if (argc < 3) goto .Lcall + + lwz 7, 8(4) # r7 = p_argv[2] + + cmpwi 0, 12, 4 + blt 0, .Lcall # if (argc < 4) goto .Lcall + + lwz 8, 12(4) # r8 = p_argv[3] + + cmpwi 0, 12, 5 + blt 0, .Lcall # if (argc < 5) goto .Lcall + + lwz 9, 16(4) # r9 = p_argv[4] + + cmpwi 0, 12, 6 + blt 0, .Lcall # if (argc < 6) goto .Lcall + + lwz 10, 20(4) # r10 = p_argv[5] + + cmpwi 0, 12, 7 + blt 0, .Lcall # if (argc < 7) goto .Lcall + +// There are more than 6 microtask parameters, so we need to store the +// remainder to the stack. + addi 12, 12, -6 # argc -= 6 + mtctr 12 + +// These are set to 4 bytes before the first desired store address (we're using +// pre-increment loads and stores in the loop below). 
The parameter save area
+// for the microtask begins 24 + 4*8 == 56 bytes above r1 for XCOFF.
+ addi 4, 4, 20 # p_argv = p_argv + 5
+ # (i.e. skip the 5 elements we already processed)
+ addi 12, 1, 52 # r12 = stack offset (56 - 4)
+
+.Lnext:
+ lwzu 0, 4(4)
+ stwu 0, 4(12)
+ bdnz .Lnext
+
+.Lcall:
+ stw 2, 20(1) # Save the TOC pointer to the linkage area
+// Load the actual function address from the function descriptor.
+ lwz 12, 0(3) # Function address
+ lwz 2, 4(3) # TOC pointer
+ lwz 11, 8(3) # Environment pointer
+
+ addi 3, 31, -12 # r3 = &gtid
+ addi 4, 31, -16 # r4 = &tid
+
+ mtctr 12 # CTR = function address
+ bctrl # Branch to CTR
+ lwz 2, 20(1) # Restore TOC pointer from linkage area
+
+#if OMPT_SUPPORT
+ li 3, 0
+ stw 3, 0(30)
+#endif
+
+ li 3, 1
+
+#if OMPT_SUPPORT
+ lwz 30, -8(31) # Restore r30 from the saved value on the stack
+#endif
+
+ mr 1, 31
+ lwz 31, -4(1) # Restore r31 from the saved value on the stack
+ lwz 0, 8(1)
+ mtlr 0 # Restore LR from the linkage area
+ blr # Branch to LR
+
+#endif // KMP_ARCH_PPC64_XCOFF
+
+.Lfunc_end0:
+ .vbyte 4, 0x00000000 # Traceback table begin
+ .byte 0x00 # Version = 0
+ .byte 0x09 # Language = CPlusPlus
+ .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue
+ # +HasTraceBackTableOffset, -IsInternalProcedure
+ # -HasControlledStorage, -IsTOCless
+ # -IsFloatingPointPresent
+ # -IsFloatingPointOperationLogOrAbortEnabled
+ .byte 0x61 # -IsInterruptHandler, +IsFunctionNamePresent, +IsAllocaUsed
+ # OnConditionDirective = 0, -IsCRSaved, +IsLRSaved
+ .byte 0x80 # +IsBackChainStored, -IsFixup, NumOfFPRsSaved = 0
+#if OMPT_SUPPORT
+ .byte 0x02 # -HasExtensionTable, -HasVectorInfo, NumOfGPRsSaved = 2
+ .byte 0x06 # NumberOfFixedParms = 6
+#else
+ .byte 0x01 # -HasExtensionTable, -HasVectorInfo, NumOfGPRsSaved = 1
+ .byte 0x05 # NumberOfFixedParms = 5
+#endif
+ .byte 0x01 # NumberOfFPParms = 0, +HasParmsOnStack
+ .vbyte 4, 0x00000000 # Parameter type = i, i, i, i, i
+ .vbyte 4, .Lfunc_end0-.__kmp_invoke_microtask # 
Function size + .vbyte 2, 0x0016 # Function name len = 22 + .byte "__kmp_invoke_microtask" # Function Name + .byte 0x1f # AllocaRegister = 31 + # -- End function + +// -- End __kmp_invoke_microtask + +// Support for unnamed common blocks. + + .comm .gomp_critical_user_, 32, 3 +#if KMP_ARCH_PPC64_XCOFF + .csect __kmp_unnamed_critical_addr[RW],3 +#else + .csect __kmp_unnamed_critical_addr[RW],2 +#endif + .globl __kmp_unnamed_critical_addr[RW] + .ptr .gomp_critical_user_ + +// -- End unnamed common block + + .toc + +#endif // KMP_OS_AIX diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp index 513ec6517d00bd..b9ff96873702b1 100644 --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -116,7 +116,7 @@ static void __kmp_print_cond(char *buffer, kmp_cond_align_t *cond) { } #endif -#if ((KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED) +#if ((KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX) && KMP_AFFINITY_SUPPORTED) /* Affinity support */ @@ -132,6 +132,29 @@ void __kmp_affinity_bind_thread(int which) { KMP_CPU_FREE_FROM_STACK(mask); } +#if KMP_OS_AIX +void __kmp_affinity_determine_capable(const char *env_var) { + // All versions of AIX support bindprocessor(). + + size_t mask_size = __kmp_xproc / CHAR_BIT; + // Round up to byte boundary. + if (__kmp_xproc % CHAR_BIT) + ++mask_size; + + // Round up to the mask_size_type boundary. 
+ if (mask_size % sizeof(__kmp_affin_mask_size)) + mask_size += sizeof(__kmp_affin_mask_size) - + mask_size % sizeof(__kmp_affin_mask_size); + KMP_AFFINITY_ENABLE(mask_size); + KA_TRACE(10, + ("__kmp_affinity_determine_capable: " + "AIX OS affinity interface bindprocessor functional (mask size = " + "%" KMP_SIZE_T_SPEC ").\n", + __kmp_affin_mask_size)); +} + +#else // !KMP_OS_AIX + /* Determine if we can access affinity functionality on this version of * Linux* OS by checking __NR_sched_{get,set}affinity system calls, and set * __kmp_affin_mask_size to the appropriate value (0 means not capable). */ @@ -259,8 +282,9 @@ void __kmp_affinity_determine_capable(const char *env_var) { KMP_WARNING(AffCantGetMaskSize, env_var); } } - -#endif // KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED +#endif // KMP_OS_AIX +#endif // (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX) && + // KMP_AFFINITY_SUPPORTED #if KMP_USE_FUTEX @@ -476,7 +500,7 @@ static void *__kmp_launch_worker(void *thr) { #endif /* KMP_BLOCK_SIGNALS */ void *exit_val; #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ - KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS + KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_AIX void *volatile padding = 0; #endif int gtid; @@ -525,7 +549,7 @@ static void *__kmp_launch_worker(void *thr) { #endif /* KMP_BLOCK_SIGNALS */ #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ - KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS + KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_AIX if (__kmp_stkoffset > 0 && gtid > 0) { padding = KMP_ALLOCA(gtid * __kmp_stkoffset); (void)padding; @@ -1245,7 +1269,7 @@ static void __kmp_atfork_child(void) { ++__kmp_fork_count; #if KMP_AFFINITY_SUPPORTED -#if KMP_OS_LINUX || KMP_OS_FREEBSD +#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX // reset the affinity in the child to the initial thread // affinity in the parent kmp_set_thread_affinity_mask_initial(); @@ -2214,6 +2238,7 @@ int 
__kmp_is_address_mapped(void *addr) { found = (int)addr < (__builtin_wasm_memory_size(0) * PAGESIZE); #elif KMP_OS_DRAGONFLY || KMP_OS_SOLARIS || KMP_OS_AIX + (void)rc; // FIXME(DragonFly, Solaris, AIX): Implement this found = 1; diff --git a/openmp/runtime/test/lit.cfg b/openmp/runtime/test/lit.cfg index 4a457f4cc41f75..2126928bb411a9 100644 --- a/openmp/runtime/test/lit.cfg +++ b/openmp/runtime/test/lit.cfg @@ -129,7 +129,7 @@ if config.operating_system == 'NetBSD': if config.operating_system == 'Darwin': config.available_features.add("darwin") -if config.operating_system in ['Linux', 'Windows']: +if config.operating_system in ['Linux', 'Windows', 'AIX']: config.available_features.add('affinity') if config.operating_system in ['Linux']: diff --git a/openmp/runtime/test/tasking/bug_nested_proxy_task.c b/openmp/runtime/test/tasking/bug_nested_proxy_task.c index 43502bdcd1abd1..9e0b412efce609 100644 --- a/openmp/runtime/test/tasking/bug_nested_proxy_task.c +++ b/openmp/runtime/test/tasking/bug_nested_proxy_task.c @@ -50,12 +50,21 @@ typedef struct kmp_depend_info { union { kmp_uint8 flag; // flag as an unsigned char struct { // flag as a set of 8 bits - unsigned in : 1; - unsigned out : 1; - unsigned mtx : 1; - unsigned set : 1; - unsigned unused : 3; - unsigned all : 1; +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + unsigned all : 1; + unsigned unused : 3; + unsigned set : 1; + unsigned mtx : 1; + unsigned out : 1; + unsigned in : 1; +#else + unsigned in : 1; + unsigned out : 1; + unsigned mtx : 1; + unsigned set : 1; + unsigned unused : 3; + unsigned all : 1; +#endif } flags; }; } kmp_depend_info_t; diff --git a/openmp/runtime/test/tasking/bug_proxy_task_dep_waiting.c b/openmp/runtime/test/tasking/bug_proxy_task_dep_waiting.c index ff75df51aff077..1e86d574f4f6a8 100644 --- a/openmp/runtime/test/tasking/bug_proxy_task_dep_waiting.c +++ b/openmp/runtime/test/tasking/bug_proxy_task_dep_waiting.c @@ -47,12 +47,21 @@ typedef struct 
kmp_depend_info { union { kmp_uint8 flag; // flag as an unsigned char struct { // flag as a set of 8 bits - unsigned in : 1; - unsigned out : 1; - unsigned mtx : 1; - unsigned set : 1; - unsigned unused : 3; - unsigned all : 1; +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + unsigned all : 1; + unsigned unused : 3; + unsigned set : 1; + unsigned mtx : 1; + unsigned out : 1; + unsigned in : 1; +#else + unsigned in : 1; + unsigned out : 1; + unsigned mtx : 1; + unsigned set : 1; + unsigned unused : 3; + unsigned all : 1; +#endif } flags; }; } kmp_depend_info_t; diff --git a/openmp/runtime/test/tasking/hidden_helper_task/common.h b/openmp/runtime/test/tasking/hidden_helper_task/common.h index 402ecf3ed553c9..68e2b584c87739 100644 --- a/openmp/runtime/test/tasking/hidden_helper_task/common.h +++ b/openmp/runtime/test/tasking/hidden_helper_task/common.h @@ -17,9 +17,21 @@ typedef struct kmp_depend_info { union { unsigned char flag; struct { - bool in : 1; - bool out : 1; - bool mtx : 1; +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + unsigned all : 1; + unsigned unused : 3; + unsigned set : 1; + unsigned mtx : 1; + unsigned out : 1; + unsigned in : 1; +#else + unsigned in : 1; + unsigned out : 1; + unsigned mtx : 1; + unsigned set : 1; + unsigned unused : 3; + unsigned all : 1; +#endif } flags; }; } kmp_depend_info_t; diff --git a/polly/lib/Analysis/DependenceInfo.cpp b/polly/lib/Analysis/DependenceInfo.cpp index 69257c603877ea..d58dc9917bc91f 100644 --- a/polly/lib/Analysis/DependenceInfo.cpp +++ b/polly/lib/Analysis/DependenceInfo.cpp @@ -950,8 +950,8 @@ class DependenceInfoPrinterLegacyPass final : public ScopPass { bool runOnScop(Scop &S) override { DependenceInfo &P = getAnalysis(); - OS << "Printing analysis '" << P.getPassName() << "' for " << "region: '" - << S.getRegion().getNameStr() << "' in function '" + OS << "Printing analysis '" << P.getPassName() << "' for " + << "region: '" << 
S.getRegion().getNameStr() << "' in function '" << S.getFunction().getName() << "':\n"; P.printScop(OS, S); diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp index c62cb2a85c835c..64314d6041b8e9 100644 --- a/polly/lib/Analysis/ScopBuilder.cpp +++ b/polly/lib/Analysis/ScopBuilder.cpp @@ -2689,9 +2689,10 @@ void ScopBuilder::addUserContext() { if (NameContext != NameUserContext) { std::string SpaceStr = stringFromIslObj(Space, "null"); errs() << "Error: the name of dimension " << i - << " provided in -polly-context " << "is '" << NameUserContext - << "', but the name in the computed " << "context is '" - << NameContext << "'. Due to this name mismatch, " + << " provided in -polly-context " + << "is '" << NameUserContext << "', but the name in the computed " + << "context is '" << NameContext + << "'. Due to this name mismatch, " << "the -polly-context option is ignored. Please provide " << "the context in the parameter space: " << SpaceStr << ".\n"; return; diff --git a/polly/lib/Analysis/ScopDetectionDiagnostic.cpp b/polly/lib/Analysis/ScopDetectionDiagnostic.cpp index 364e21aef207ce..58c2b4fedc478d 100644 --- a/polly/lib/Analysis/ScopDetectionDiagnostic.cpp +++ b/polly/lib/Analysis/ScopDetectionDiagnostic.cpp @@ -45,7 +45,7 @@ using namespace llvm; #define DEBUG_TYPE "polly-detect" #define SCOP_STAT(NAME, DESC) \ - { "polly-detect", "NAME", "Number of rejected regions: " DESC } + {"polly-detect", "NAME", "Number of rejected regions: " DESC} static Statistic RejectStatistics[] = { SCOP_STAT(CFG, ""), diff --git a/polly/lib/Exchange/JSONExporter.cpp b/polly/lib/Exchange/JSONExporter.cpp index 74d4e6c7993fa3..63fb06a634cc12 100644 --- a/polly/lib/Exchange/JSONExporter.cpp +++ b/polly/lib/Exchange/JSONExporter.cpp @@ -842,7 +842,7 @@ class JSONImporterPrinterLegacyPass final : public ScopPass { public: static char ID; - JSONImporterPrinterLegacyPass() : JSONImporterPrinterLegacyPass(outs()){}; + JSONImporterPrinterLegacyPass() : 
JSONImporterPrinterLegacyPass(outs()) {} explicit JSONImporterPrinterLegacyPass(llvm::raw_ostream &OS) : ScopPass(ID), OS(OS) {} diff --git a/polly/lib/Transform/DeLICM.cpp b/polly/lib/Transform/DeLICM.cpp index 51e701346563a1..dae5e79639f7be 100644 --- a/polly/lib/Transform/DeLICM.cpp +++ b/polly/lib/Transform/DeLICM.cpp @@ -1463,7 +1463,7 @@ class DeLICMPrinterLegacyPass final : public ScopPass { public: static char ID; - DeLICMPrinterLegacyPass() : DeLICMPrinterLegacyPass(outs()){}; + DeLICMPrinterLegacyPass() : DeLICMPrinterLegacyPass(outs()) {} explicit DeLICMPrinterLegacyPass(llvm::raw_ostream &OS) : ScopPass(ID), OS(OS) {} diff --git a/polly/lib/Transform/FlattenSchedule.cpp b/polly/lib/Transform/FlattenSchedule.cpp index 53e230be7a6945..87bf642ba0d92b 100644 --- a/polly/lib/Transform/FlattenSchedule.cpp +++ b/polly/lib/Transform/FlattenSchedule.cpp @@ -103,7 +103,7 @@ class FlattenSchedulePrinterLegacyPass final : public ScopPass { static char ID; FlattenSchedulePrinterLegacyPass() - : FlattenSchedulePrinterLegacyPass(outs()){}; + : FlattenSchedulePrinterLegacyPass(outs()) {} explicit FlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS) : ScopPass(ID), OS(OS) {} diff --git a/polly/lib/Transform/ForwardOpTree.cpp b/polly/lib/Transform/ForwardOpTree.cpp index 2bed3e35412d76..5e6de2e182a526 100644 --- a/polly/lib/Transform/ForwardOpTree.cpp +++ b/polly/lib/Transform/ForwardOpTree.cpp @@ -1149,7 +1149,7 @@ class ForwardOpTreePrinterLegacyPass final : public ScopPass { public: static char ID; - ForwardOpTreePrinterLegacyPass() : ForwardOpTreePrinterLegacyPass(outs()){}; + ForwardOpTreePrinterLegacyPass() : ForwardOpTreePrinterLegacyPass(outs()) {} explicit ForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS) : ScopPass(ID), OS(OS) {}